From 48b1e4e6523af9af5c63509d803d148d49edcf36 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 23 Jan 2023 17:25:18 -0500 Subject: [PATCH 01/89] commonvoice speech recognition recipe --- egs/commonvoice/v1/cmd.sh | 28 ++ egs/commonvoice/v1/conf/clsp.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_bigmem.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_long.conf | 13 + egs/commonvoice/v1/conf/coe_gpu_rtx.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_short.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_v100.conf | 11 + egs/commonvoice/v1/conf/reverb_noise_aug.yaml | 35 ++ ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v2.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.1.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.2.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v4.3.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v4.4.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage2_v1.0.yaml | 56 +++ ...v2vec2xlsr300m_transducer_stage2_v3.2.yaml | 61 +++ ...v2vec2xlsr300m_transducer_stage2_v3.3.yaml | 61 +++ .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 14 + .../conf/wav2vec2xlsr300m_transducer_do.yaml | 13 + .../wav2vec2xlsr300m_transducer_do0.2.yaml | 13 + .../wav2vec2xlsr300m_transducer_do0.3.yaml | 13 + .../wav2vec2xlsr300m_transducer_do0.4.yaml | 13 + .../wav2vec2xlsr300m_transducer_enclast.yaml | 11 + egs/commonvoice/v1/datapath.sh | 22 + egs/commonvoice/v1/default_config.sh | 1 + egs/commonvoice/v1/feats | 1 + .../v1/global_conf/config_transducer_v3.3.sh | 39 ++ .../global_conf/config_transducer_v3.3_it.sh | 41 ++ egs/commonvoice/v1/hyp_utils | 1 + egs/commonvoice/v1/local/data_prep.sh | 33 ++ egs/commonvoice/v1/local/make_musan.py | 189 ++++++++ egs/commonvoice/v1/local/make_musan.sh | 48 ++ egs/commonvoice/v1/local/make_rirs_data.sh | 29 ++ egs/commonvoice/v1/local/prepare_lang.py | 410 ++++++++++++++++++ egs/commonvoice/v1/local/prepare_lang_bpe.py | 259 +++++++++++ egs/commonvoice/v1/local/train_bpe_model.py | 97 +++++ .../v1/local/validate_bpe_lexicon.py | 77 ++++ egs/commonvoice/v1/path.sh | 5 + egs/commonvoice/v1/run_001_prepare_data.sh | 50 +++ .../v1/run_003_prepare_noises_rirs.sh | 67 +++ egs/commonvoice/v1/run_004_compute_bpe.sh | 105 +++++ egs/commonvoice/v1/run_011_train_asr.sh | 119 +++++ egs/commonvoice/v1/run_030_inference.sh | 47 ++ egs/commonvoice/v1/steps | 1 + egs/commonvoice/v1/steps_be | 1 + egs/commonvoice/v1/steps_pyfe | 1 + egs/commonvoice/v1/steps_transducer | 1 + egs/commonvoice/v1/steps_xvec | 1 + egs/commonvoice/v1/utils | 1 + egs/commonvoice/v1/xvectors | 1 + .../decode_wav2vec2transducer.sh | 80 ++++ .../preprocess_audios_for_nnet_train.sh | 112 +++++ hyperion/bin/preprocess_audio_files.py | 8 + 55 files changed, 2673 insertions(+) create mode 100755 egs/commonvoice/v1/cmd.sh create mode 100644 egs/commonvoice/v1/conf/clsp.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_bigmem.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_long.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_rtx.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_short.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_v100.conf create mode 100644 egs/commonvoice/v1/conf/reverb_noise_aug.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml create mode 100644 
egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml create mode 100644 egs/commonvoice/v1/datapath.sh create mode 120000 egs/commonvoice/v1/default_config.sh create mode 120000 egs/commonvoice/v1/feats create mode 100644 egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh create mode 100644 egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh create mode 120000 egs/commonvoice/v1/hyp_utils create mode 100755 egs/commonvoice/v1/local/data_prep.sh create mode 100755 egs/commonvoice/v1/local/make_musan.py create mode 100755 egs/commonvoice/v1/local/make_musan.sh create mode 100755 egs/commonvoice/v1/local/make_rirs_data.sh create mode 100755 egs/commonvoice/v1/local/prepare_lang.py create mode 100755 egs/commonvoice/v1/local/prepare_lang_bpe.py create mode 100755 egs/commonvoice/v1/local/train_bpe_model.py create mode 100755 egs/commonvoice/v1/local/validate_bpe_lexicon.py create mode 100755 egs/commonvoice/v1/path.sh create mode 100755 egs/commonvoice/v1/run_001_prepare_data.sh create mode 100755 egs/commonvoice/v1/run_003_prepare_noises_rirs.sh create mode 100755 egs/commonvoice/v1/run_004_compute_bpe.sh create mode 100755 egs/commonvoice/v1/run_011_train_asr.sh create mode 100755 egs/commonvoice/v1/run_030_inference.sh create mode 120000 egs/commonvoice/v1/steps create mode 120000 egs/commonvoice/v1/steps_be create mode 120000 egs/commonvoice/v1/steps_pyfe create mode 120000 egs/commonvoice/v1/steps_transducer create mode 120000 egs/commonvoice/v1/steps_xvec create mode 120000 egs/commonvoice/v1/utils create mode 120000 egs/commonvoice/v1/xvectors create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2transducer.sh create mode 100755 hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh new file mode 100755 index 00000000..89dbb7d8 --- /dev/null +++ b/egs/commonvoice/v1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " + export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/commonvoice/v1/conf/clsp.conf b/egs/commonvoice/v1/conf/clsp.conf new file mode 100644 index 00000000..959c62a7 --- /dev/null +++ b/egs/commonvoice/v1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf b/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/commonvoice/v1/conf/coe_gpu_long.conf b/egs/commonvoice/v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/commonvoice/v1/conf/coe_gpu_rtx.conf b/egs/commonvoice/v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not 
add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/commonvoice/v1/conf/coe_gpu_short.conf b/egs/commonvoice/v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/commonvoice/v1/conf/coe_gpu_v100.conf b/egs/commonvoice/v1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/commonvoice/v1/conf/reverb_noise_aug.yaml b/egs/commonvoice/v1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/commonvoice/v1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml new file mode 100644 index 00000000..edc0af5e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml new file mode 100644 index 00000000..aefddc7e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_enclast.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml new file mode 100644 index 00000000..49077fd6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml new file mode 100644 index 00000000..9f070bbe --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml new file mode 100644 index 00000000..d787a373 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml new file mode 100644 index 00000000..564ea8c7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml new file mode 100644 index 00000000..35b2b47c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml new file mode 100644 index 00000000..855bfc98 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml new file mode 100644 index 00000000..0f328e08 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml @@ -0,0 +1,56 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' + min_batch_size: 4 + batch_size: 4 + iters_per_epoch: 6 + drop_last: true + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' + min_batch_size: 2 + batch_size: 2 + iters_per_epoch: 6 + drop_last: true + data_loader: + num_workers: 8 +model: {} +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: full + + \ No newline at end of file diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml new file mode 100644 index 00000000..69c489b0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml new file mode 100644 index 00000000..8017f9b3 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml new file mode 100644 index 00000000..a7071b8c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + #embedding_dim: 128 + #num_layers: 1 + #hidden_dim: 64 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml new file mode 100644 index 00000000..c7fc2df7 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.1 + rnn_dropout_rate: 0.1 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml new file mode 100644 index 00000000..1ee4ec72 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml new file mode 100644 index 00000000..ca7c1995 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml new file mode 
100644 index 00000000..9fed09e7 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml new file mode 100644 index 00000000..1d46c33c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml @@ -0,0 +1,11 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + joiner: + num_layers: 1 +feat_fusion_method: last + diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh new file mode 100644 index 00000000..4c7987ef --- /dev/null +++ b/egs/commonvoice/v1/datapath.sh @@ -0,0 +1,22 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + librispeech_root=/export/corpora5/LibriSpeech + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + # voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + # voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + # musan_root=/expscratch/dgromero/corpora-open/musan + echo "Put your database paths here" + exit 1 +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/commonvoice/v1/default_config.sh b/egs/commonvoice/v1/default_config.sh new file mode 120000 index 00000000..2b6239b6 --- /dev/null +++ b/egs/commonvoice/v1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_transducer_v1.sh \ No newline at end of file diff --git a/egs/commonvoice/v1/feats b/egs/commonvoice/v1/feats new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/commonvoice/v1/feats @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh new file mode 100644 index 00000000..4800e6fe --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=sv_train_proc_audio +dev_data=sv_dev_proc_audio +test_data=sv_test_proc_audio + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0120.pth + 
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh new file mode 100644 index 00000000..c0fbe9dc --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -0,0 +1,41 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=it_train_proc_audio +dev_data=it_dev_proc_audio +test_data=it_test_proc_audio + +language=it + +bpe_model=data/it_lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/hyp_utils b/egs/commonvoice/v1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/commonvoice/v1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/commonvoice/v1/local/data_prep.sh b/egs/commonvoice/v1/local/data_prep.sh new file mode 100755 index 00000000..d68c2368 --- /dev/null +++ b/egs/commonvoice/v1/local/data_prep.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +. ./cmd.sh +. ./path.sh + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <language> <src-dir> <dst-dir>" + echo "e.g.: $0 ${language} /export/c06/ylu125/GSP/corpora/CommonVoice data/" + exit 1 +fi + +language=$1 +src=$2 +dst=$3 + +if [ !
-d $src/cv-corpus-12.0-2022-12-07/${language} ]; then + wget https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-12.0-2022-12-07/cv-corpus-12.0-2022-12-07-${language}.tar.gz + tar -xvzf cv-corpus-12.0-2022-12-07-${language}.tar.gz -C $src + rm cv-corpus-12.0-2022-12-07-${language}.tar.gz +fi + + +lhotse prepare commonvoice -l ${language} $src/cv-corpus-12.0-2022-12-07/ ${dst}/${language} + + +for part in dev test train +do + lhotse kaldi export ${dst}/${language}/cv-${language}_recordings_${part}.jsonl.gz ${dst}/${language}/cv-${language}_supervisions_${part}.jsonl.gz ${dst}/${language}_${part} + utils/utt2spk_to_spk2utt.pl ${dst}/${language}_${part}/utt2spk > ${dst}/${language}_${part}/spk2utt + utils/fix_data_dir.sh ${dst}/${language}_${part} + steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${language}_${part} +done + diff --git a/egs/commonvoice/v1/local/make_musan.py b/egs/commonvoice/v1/local/make_musan.py new file mode 100755 index 00000000..b0ae6846 --- /dev/null +++ b/egs/commonvoice/v1/local/make_musan.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musician ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + + +def prepare_music(root_dir, fs, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt +
" sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/make_musan.sh b/egs/commonvoice/v1/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/commonvoice/v1/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] "; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." 
+mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/commonvoice/v1/local/make_rirs_data.sh b/egs/commonvoice/v1/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/commonvoice/v1/local/make_rirs_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom" +fi + +rir_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir + +rir_list=$rir_dir/rir_list +if [ "$fs" -eq 16 ];then + awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp +else + awk '{ +key=$5; sub(/.*\//,"",key); +print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \ + $rir_list > $data_dir/wav.scp +fi +awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room + diff --git a/egs/commonvoice/v1/local/prepare_lang.py b/egs/commonvoice/v1/local/prepare_lang.py new file mode 100755 index 00000000..39d76146 --- /dev/null +++ b/egs/commonvoice/v1/local/prepare_lang.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script takes as input a lexicon file "data/lang_phone/lexicon.txt" +consisting of words and tokens (i.e., phones) and does the following: + +1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt + +2. Generate tokens.txt, the token table mapping a token to a unique integer. + +3. Generate words.txt, the word table mapping a word to a unique integer. + +4. Generate L.pt, in k2 format. It can be loaded by + + d = torch.load("L.pt") + lexicon = k2.Fsa.from_dict(d) + +5. Generate L_disambig.pt, in k2 format. 
+""" +import argparse +import math +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import k2 +import torch + +from hyperion.utils.lexicon import read_lexicon, write_lexicon + +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain a file lexicon.txt. + Generated files by this script are saved into this directory. + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + """, + ) + + return parser.parse_args() + + +def write_mapping(filename: str, sym2id: Dict[str, int]) -> None: + """Write a symbol to ID mapping to a file. + + Note: + No need to implement `read_mapping` as it can be done + through :func:`k2.SymbolTable.from_file`. + + Args: + filename: + Filename to save the mapping. + sym2id: + A dict mapping symbols to IDs. + Returns: + Return None. + """ + with open(filename, "w", encoding="utf-8") as f: + for sym, i in sym2id.items(): + f.write(f"{sym} {i}\n") + + +def get_tokens(lexicon: Lexicon) -> List[str]: + """Get tokens from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique tokens. + """ + ans = set() + for _, tokens in lexicon: + ans.update(tokens) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def get_words(lexicon: Lexicon) -> List[str]: + """Get words from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique words. + """ + ans = set() + for word, _ in lexicon: + ans.add(word) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]: + """It adds pseudo-token disambiguation symbols #1, #2 and so on + at the ends of tokens to ensure that all pronunciations are different, + and that none is a prefix of another. + + See also add_lex_disambig.pl from kaldi. + + Args: + lexicon: + It is returned by :func:`read_lexicon`. + Returns: + Return a tuple with two elements: + + - The output lexicon with disambiguation symbols + - The ID of the max disambiguation symbol that appears + in the lexicon + """ + + # (1) Work out the count of each token-sequence in the + # lexicon. + count = defaultdict(int) + for _, tokens in lexicon: + count[" ".join(tokens)] += 1 + + # (2) For each left sub-sequence of each token-sequence, note down + # that it exists (for identifying prefixes of longer strings). + issubseq = defaultdict(int) + for _, tokens in lexicon: + tokens = tokens.copy() + tokens.pop() + while tokens: + issubseq[" ".join(tokens)] = 1 + tokens.pop() + + # (3) For each entry in the lexicon: + # if the token sequence is unique and is not a + # prefix of another word, no disambig symbol. + # Else output #1, or #2, #3, ... if the same token-seq + # has already been assigned a disambig symbol. 
+ ans = [] + + # We start with #1 since #0 has its own purpose + first_allowed_disambig = 1 + max_disambig = first_allowed_disambig - 1 + last_used_disambig_symbol_of = defaultdict(int) + + for word, tokens in lexicon: + tokenseq = " ".join(tokens) + assert tokenseq != "" + if issubseq[tokenseq] == 0 and count[tokenseq] == 1: + ans.append((word, tokens)) + continue + + cur_disambig = last_used_disambig_symbol_of[tokenseq] + if cur_disambig == 0: + cur_disambig = first_allowed_disambig + else: + cur_disambig += 1 + + if cur_disambig > max_disambig: + max_disambig = cur_disambig + last_used_disambig_symbol_of[tokenseq] = cur_disambig + tokenseq += f" #{cur_disambig}" + ans.append((word, tokenseq.split())) + return ans, max_disambig + + +def generate_id_map(symbols: List[str]) -> Dict[str, int]: + """Generate ID maps, i.e., map a symbol to a unique ID. + + Args: + symbols: + A list of unique symbols. + Returns: + A dict containing the mapping between symbols and IDs. + """ + return {sym: i for i, sym in enumerate(symbols)} + + +def add_self_loops(arcs: List[List[Any]], disambig_token: int, + disambig_word: int) -> List[List[Any]]: + """Adds self-loops to states of an FST to propagate disambiguation symbols + through it. They are added on each state with non-epsilon output symbols + on at least one arc out of the state. + + See also fstaddselfloops.pl from Kaldi. One difference is that + Kaldi uses OpenFst style FSTs and it has multiple final states. + This function uses k2 style FSTs and it does not need to add self-loops + to the final state. + + The input label of a self-loop is `disambig_token`, while the output + label is `disambig_word`. + + Args: + arcs: + A list-of-list. The sublist contains + `[src_state, dest_state, label, aux_label, score]` + disambig_token: + It is the token ID of the symbol `#0`. + disambig_word: + It is the word ID of the symbol `#0`. + + Return: + Return new `arcs` containing self-loops. + """ + states_needs_self_loops = set() + for arc in arcs: + src, dst, ilabel, olabel, score = arc + if olabel != 0: + states_needs_self_loops.add(src) + + ans = [] + for s in states_needs_self_loops: + ans.append([s, s, disambig_token, disambig_word, 0]) + + return arcs + ans + + +def lexicon_to_fst( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + sil_token: str = "SIL", + sil_prob: float = 0.5, + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format) with optional silence at + the beginning and end of each word. + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + sil_token: + The silence token. + sil_prob: + The probability for adding a silence at the beginning and end + of the word. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + assert sil_prob > 0.0 and sil_prob < 1.0 + # CAUTION: we use score, i.e, negative cost. + sil_score = math.log(sil_prob) + no_sil_score = math.log(1.0 - sil_prob) + + start_state = 0 + loop_state = 1 # words enter and leave from here + sil_state = 2 # words terminate here when followed by silence; this state + # has a silence transition to loop_state. 
+ next_state = 3 # the next un-allocated state, will be incremented as we go. + arcs = [] + + assert token2id["<eps>"] == 0 + assert word2id["<eps>"] == 0 + + eps = 0 + + sil_token = token2id[sil_token] + + arcs.append([start_state, loop_state, eps, eps, no_sil_score]) + arcs.append([start_state, sil_state, eps, eps, sil_score]) + arcs.append([sil_state, loop_state, sil_token, eps, 0]) + + for word, tokens in lexicon: + assert len(tokens) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + tokens = [token2id[i] for i in tokens] + + for i in range(len(tokens) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, tokens[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last token of this word + # It has two out-going arcs, one to the loop state, + # the other one to the sil_state. + i = len(tokens) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score]) + arcs.append([cur_state, sil_state, tokens[i], w, sil_score]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + lexicon_filename = lang_dir / "lexicon.txt" + sil_token = "SIL" + sil_prob = 0.5 + + lexicon = read_lexicon(lexicon_filename) + tokens = get_tokens(lexicon) + words = get_words(lexicon) + + lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) + + for i in range(max_disambig + 1): + disambig = f"#{i}" + assert disambig not in tokens + tokens.append(f"#{i}") + + assert "<eps>" not in tokens + tokens = ["<eps>"] + tokens + + assert "<eps>" not in words + assert "#0" not in words + assert "<s>" not in words + assert "</s>" not in words + + words = ["<eps>"] + words + ["#0", "<s>", "</s>"] + + token2id = generate_id_map(tokens) + word2id = generate_id_map(words) + + write_mapping(lang_dir / "tokens.txt", token2id) + write_mapping(lang_dir / "words.txt", word2id) + write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig) + + L = lexicon_to_fst( + lexicon, + token2id=token2id, + word2id=word2id, + sil_token=sil_token, + sil_prob=sil_prob, + ) + + L_disambig = lexicon_to_fst( + lexicon_disambig, + token2id=token2id, + word2id=word2id, + sil_token=sil_token, + sil_prob=sil_prob, + need_self_loops=True, + ) + torch.save(L.as_dict(), lang_dir / "L.pt") + torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") + + if args.debug: + labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") + aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") + + L.labels_sym = labels_sym + L.aux_labels_sym = aux_labels_sym + L.draw(f"{lang_dir / 'L.svg'}", title="L.pt") + + L_disambig.labels_sym = labels_sym + L_disambig.aux_labels_sym = aux_labels_sym + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/prepare_lang_bpe.py b/egs/commonvoice/v1/local/prepare_lang_bpe.py new file mode 100755 index 00000000..7838b6a0 --- /dev/null +++ b/egs/commonvoice/v1/local/prepare_lang_bpe.py @@ -0,0 +1,259 @@
+#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) +""" + +This script takes as input `lang_dir`, which should contain:: + + - lang_dir/bpe.model, + - lang_dir/words.txt + +and generates the following files in the directory `lang_dir`: + + - lexicon.txt + - lexicon_disambig.txt + - L.pt + - L_disambig.pt + - tokens.txt +""" + +import argparse +from pathlib import Path +from typing import Dict, List, Tuple + +import k2 +import sentencepiece as spm +import torch +from prepare_lang import ( + Lexicon, + add_disambig_symbols, + add_self_loops, + write_lexicon, + write_mapping, +) + + +def lexicon_to_fst_no_sil( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format). + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + loop_state = 0 # words enter and leave from here + next_state = 1 # the next un-allocated state, will be incremented as we go + + arcs = [] + + # The blank symbol <blk> is defined in local/train_bpe_model.py + assert token2id["<blk>"] == 0 + assert word2id["<eps>"] == 0 + + eps = 0 + + for word, pieces in lexicon: + assert len(pieces) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + pieces = [token2id[i] for i in pieces] + + for i in range(len(pieces) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, pieces[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last piece of this word + i = len(pieces) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, pieces[i], w, 0]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def generate_lexicon(model_file: str, + words: List[str]) -> Tuple[Lexicon, Dict[str, int]]: + """Generate a lexicon from a BPE model. + + Args: + model_file: + Path to a sentencepiece model. + words: + A list of strings representing words.
+ Returns: + Return a tuple with two elements: + - A dict whose keys are words and values are the corresponding + word pieces. + - A dict representing the token symbol, mapping from tokens to IDs. + """ + sp = spm.SentencePieceProcessor() + sp.load(str(model_file)) + + # Convert word to word piece IDs instead of word piece strings + # to avoid OOV tokens. + words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int) + + # Now convert word piece IDs back to word piece strings. + words_pieces: List[List[str]] = [ + sp.id_to_piece(ids) for ids in words_pieces_ids + ] + + lexicon = [] + for word, pieces in zip(words, words_pieces): + lexicon.append((word, pieces)) + + # The OOV word is + lexicon.append(("", [sp.id_to_piece(sp.unk_id())])) + + token2id: Dict[str, int] = dict() + for i in range(sp.vocab_size()): + token2id[sp.id_to_piece(i)] = i + + return lexicon, token2id + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain the bpe.model and words.txt + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + + See "test/test_bpe_lexicon.py" for usage. + """, + ) + + return parser.parse_args() + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + model_file = lang_dir / "bpe.model" + + word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt") + + words = word_sym_table.symbols + + excluded = [ + "", "!SIL", "", "", "#0", "", "" + ] + for w in excluded: + if w in words: + words.remove(w) + + lexicon, token_sym_table = generate_lexicon(model_file, words) + + lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) + + next_token_id = max(token_sym_table.values()) + 1 + for i in range(max_disambig + 1): + disambig = f"#{i}" + assert disambig not in token_sym_table + token_sym_table[disambig] = next_token_id + next_token_id += 1 + + word_sym_table.add("#0") + word_sym_table.add("") + word_sym_table.add("") + + write_mapping(lang_dir / "tokens.txt", token_sym_table) + + write_lexicon(lang_dir / "lexicon.txt", lexicon) + write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig) + + L = lexicon_to_fst_no_sil( + lexicon, + token2id=token_sym_table, + word2id=word_sym_table, + ) + + L_disambig = lexicon_to_fst_no_sil( + lexicon_disambig, + token2id=token_sym_table, + word2id=word_sym_table, + need_self_loops=True, + ) + torch.save(L.as_dict(), lang_dir / "L.pt") + torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") + + if args.debug: + labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") + aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") + + L.labels_sym = labels_sym + L.aux_labels_sym = aux_labels_sym + L.draw(f"{lang_dir / 'L.svg'}", title="L.pt") + + L_disambig.labels_sym = labels_sym + L_disambig.aux_labels_sym = aux_labels_sym + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/train_bpe_model.py b/egs/commonvoice/v1/local/train_bpe_model.py new file mode 100755 index 00000000..42aba957 --- /dev/null +++ b/egs/commonvoice/v1/local/train_bpe_model.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. 
(authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# You can install sentencepiece via: +# +# pip install sentencepiece +# +# Due to an issue reported in +# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030 +# +# Please install a version >=0.1.96 + +import argparse +import shutil +from pathlib import Path + +import sentencepiece as spm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + The generated bpe.model is saved to this directory. + """, + ) + + parser.add_argument( + "--transcript", + type=str, + help="Training transcript.", + ) + + parser.add_argument( + "--vocab-size", + type=int, + help="Vocabulary size for BPE training", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + vocab_size = args.vocab_size + lang_dir = Path(args.lang_dir) + + model_type = "unigram" + + model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" + train_text = args.transcript + character_coverage = 1.0 + input_sentence_size = 100000000 + + user_defined_symbols = ["", ""] + unk_id = len(user_defined_symbols) + # Note: unk_id is fixed to 2. + # If you change it, you should also change other + # places that are using it. + + model_file = Path(model_prefix + ".model") + if not model_file.is_file(): + spm.SentencePieceTrainer.train( + input=train_text, + vocab_size=vocab_size, + model_type=model_type, + model_prefix=model_prefix, + input_sentence_size=input_sentence_size, + character_coverage=character_coverage, + user_defined_symbols=user_defined_symbols, + unk_id=unk_id, + bos_id=-1, + eos_id=-1, + ) + + shutil.copyfile(model_file, f"{lang_dir}/bpe.model") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/validate_bpe_lexicon.py b/egs/commonvoice/v1/local/validate_bpe_lexicon.py new file mode 100755 index 00000000..36962933 --- /dev/null +++ b/egs/commonvoice/v1/local/validate_bpe_lexicon.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script checks that there are no OOV tokens in the BPE-based lexicon. 
+ +Usage example: + + python3 ./local/validate_bpe_lexicon.py \ + --lexicon /path/to/lexicon.txt \ + --bpe-model /path/to/bpe.model +""" + +import argparse +from pathlib import Path +from typing import List, Tuple + +import sentencepiece as spm + +from hyperion.utils.lexicon import read_lexicon + +# Map word to word pieces +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--lexicon", + required=True, + type=Path, + help="Path to lexicon.txt", + ) + + parser.add_argument( + "--bpe-model", + required=True, + type=Path, + help="Path to bpe.model", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + assert args.lexicon.is_file(), args.lexicon + assert args.bpe_model.is_file(), args.bpe_model + + lexicon = read_lexicon(args.lexicon) + + sp = spm.SentencePieceProcessor() + sp.load(str(args.bpe_model)) + + word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size())))) + for word, pieces in lexicon: + for p in pieces: + if p not in word_pieces: + raise ValueError(f"The word {word} contains an OOV token {p}") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/path.sh b/egs/commonvoice/v1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/commonvoice/v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh new file mode 100755 index 00000000..d839fac6 --- /dev/null +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh +. $config_file + + +nj=6 + +mkdir -p data + +commonvoice=/export/c06/ylu125/GSP/corpora/CommonVoice + + +if [ ${stage} -le 1 ]; then + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Data preparation" + for lan in it #sv-SE + do + # use underscore-separated names in data directories. + local/data_prep.sh ${lan} $commonvoice data/ + done +fi + +if [ ${stage} -le 2 ]; then + echo "stage 2: Data conversion" + # for part in $test_data $dev_data $nnet_data + for lan in it #sv-SE + do + for part in ${lan}_test ${lan}_dev ${lan}_train + do + echo ${part} + steps_transducer/preprocess_audios_for_nnet_train.sh --nj 20 --cmd "$train_cmd" \ + --storage_name commonvoice-v1-$(date +'%m_%d_%H_%M') --use-bin-vad false \ + --osr 16000 data/${part} data/${part}_proc_audio exp/${part}_proc_audio + utils/fix_data_dir.sh data/${part}_proc_audio || true + done + done +fi diff --git a/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh b/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..6bdcb4f2 --- /dev/null +++ b/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh new file mode 100755 index 00000000..617f03ae --- /dev/null +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -0,0 +1,105 @@ +#!/bin/bash + + +. ./cmd.sh +. ./path.sh +set -e + +vocab_sizes=( + # 5000 + 2000 + 1000 + 500 +) + +dl_dir=$PWD/download + +stage=1 +stop_stage=4 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh +. $config_file + + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + echo "Stage 1: Dump transcripts for LM training" + mkdir -p data/lm + gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + | jq '.text' \ + | sed 's:"::g' \ + > data/lm/${language}_transcript_words.txt +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + echo "Stage 2: Prepare BPE based lang" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/${language}_lang_bpe_${vocab_size} + mkdir -p $lang_dir + + # Add special words to words.txt + echo " 0" > $lang_dir/words.txt + echo "!SIL 1" >> $lang_dir/words.txt + echo " 2" >> $lang_dir/words.txt + + # Add regular words to words.txt + gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + | jq '.text' \ + | sed 's:"::g' \ + | sed 's: :\n:g' \ + | sort \ + | uniq \ + | sed '/^$/d' \ + | awk '{print $0,NR+2}' \ + >> $lang_dir/words.txt + + # Add remaining special word symbols expected by LM scripts. 
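+  # Illustration (not part of the original script): the finished words.txt is a
+  # plain "symbol integer-id" table. The angle-bracket names below follow the
+  # usual icefall convention and are assumptions, not taken from this patch:
+  #   <eps> 0
+  #   !SIL 1
+  #   <UNK> 2
+  #   ... one line per regular word, with ids 3, 4, 5, ...
+  #   <s>, </s> and #0 appended at the end with the next free ids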
+ num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo "#0 ${num_words}" >> $lang_dir/words.txt + + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript data/lm/${language}_transcript_words.txt + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + fi + done +fi + +# if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then +# echo "Stage 3: Train LM" +# lm_dir=data/lm + +# if [ ! -f $lm_dir/G.arpa ]; then +# ./shared/make_kn_lm.py \ +# -ngram-order 3 \ +# -text $lm_dir/transcript_words.txt \ +# -lm $lm_dir/G.arpa +# fi + +# if [ ! -f $lm_dir/G_3_gram.fst.txt ]; then +# python3 -m kaldilm \ +# --read-symbol-table="data/lang_phone/words.txt" \ +# --disambig-symbol='#0' \ +# --max-order=3 \ +# $lm_dir/G.arpa > $lm_dir/G_3_gram.fst.txt +# fi +# fi + +# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then +# echo "Stage 4: Compile HLG" +# ./local/compile_hlg.py --lang-dir data/lang_phone + +# for vocab_size in ${vocab_sizes[@]}; do +# lang_dir=data/lang_bpe_${vocab_size} +# ./local/compile_hlg.py --lang-dir $lang_dir +# done +# fi \ No newline at end of file diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh new file mode 100755 index 00000000..1b402133 --- /dev/null +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=1 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh new file mode 
100755 index 00000000..86dccf0a --- /dev/null +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + + + +# Extracts x-vectors for evaluation +for name in $dev_data $test_data + do + nj=16 + steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model + done +exit diff --git a/egs/commonvoice/v1/steps b/egs/commonvoice/v1/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/commonvoice/v1/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_be b/egs/commonvoice/v1/steps_be new file mode 120000 index 00000000..b2098c2a --- /dev/null +++ b/egs/commonvoice/v1/steps_be @@ -0,0 +1 @@ +../v1/steps_be \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_pyfe b/egs/commonvoice/v1/steps_pyfe new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/commonvoice/v1/steps_pyfe @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_transducer b/egs/commonvoice/v1/steps_transducer new file mode 120000 index 00000000..c9fd1392 --- /dev/null +++ b/egs/commonvoice/v1/steps_transducer @@ -0,0 +1 @@ +hyp_utils/steps_transducer \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_xvec b/egs/commonvoice/v1/steps_xvec new file mode 120000 index 00000000..289276b7 --- /dev/null +++ b/egs/commonvoice/v1/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors/ \ No newline at end of file diff --git a/egs/commonvoice/v1/utils b/egs/commonvoice/v1/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/commonvoice/v1/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/commonvoice/v1/xvectors b/egs/commonvoice/v1/xvectors new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/commonvoice/v1/xvectors @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh new file mode 100755 index 00000000..143087a5 --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." + echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2transducer.py \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text + set -e +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text +fi diff --git a/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh b/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh new file mode 100755 index 00000000..ef54ceed --- /dev/null +++ b/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +set -e +nj=40 +cmd="run.pl" +stage=0 +file_format=flac +nodes=b1 +storage_name=$(date +'%m_%d_%H_%M') +proc_opts="--remove-dc-offset" +use_bin_vad=false +osr=16000 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --file-format # Output format supported by soundfile (flac,ogg,wav,...)" + echo " --proc-opts # Extra arguments for proc-audio-files.py" + echo " --use-bin-vad # Removes silence using binary vad" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/wav.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +output_dir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $output_dir/storage ]; then + dir_name=$USER/hyp-data/$storage_name/xvector_audio/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $output_dir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17,18}/$dir_name $output_dir/storage + elif [ "$nodes" == "s01" ];then + utils/create_split_dir.pl \ + /export/s01/$dir_name $output_dir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{01,06,07,08,09}/$dir_name $output_dir/storage + elif [ "$nodes" == "fs05" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/fs05/$dir_name $output_dir/storage + fi + + for f in $(awk '{ print $1}' $data_in/wav.scp); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $output_dir/$f.$file_format + done +fi + + +for f in reco2dur segments spk2utt text utt2dur utt2gender utt2lang utt2spk wav.scp spk2gender +do + if [ -f $data_in/$f ];then + cp $data_in/$f $data_out/$f + fi +done + +args="" +if [ "$use_bin_vad" == "true" ];then + args="${args} --vad scp:$data_in/vad.scp" +else + f=vad.scp + if [ -f $data_in/$f ];then + cp $data_in/$f $data_out/$f + fi +fi + +$cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + --write-time-durs $output_dir/utt2dur.${name}.JOB \ + --part-idx JOB --num-parts $nj \ + --output-sampling-rate $osr \ + --input $data_in/wav.scp \ + --output-path $output_dir \ + --output-script $output_dir/wav.${name}.JOB.scp + +for n in $(seq $nj); do + cat $output_dir/wav.${name}.$n.scp || exit 1; +done > ${data_out}/wav.scp || exit 1 + +for n in $(seq $nj); do + cat $output_dir/utt2dur.${name}.$n || exit 1; +done > ${data_out}/utt2dur || exit 1 + +echo "$0: Succeeded processing audios for $name" diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 67b1cf61..2698e61f 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -45,6 +45,7 @@ def process_audio_files( write_time_durs_spec, vad_spec, vad_path_prefix, + output_sampling_rate, vad_fs=100, vad_dilation=0, vad_erosion=0, @@ -74,6 +75,10 @@ def process_audio_files( logging.info("Processing audio %s" % (key)) t2 = time.time() + if output_sampling_rate is not None: + x = signal.resample(x, int(x.shape[0]*output_sampling_rate/fs)) + fs = output_sampling_rate + tot_samples = x.shape[0] if vad_spec is not None: num_vad_frames = int(round(tot_samples * vad_fs / fs)) @@ -95,6 +100,7 @@ def process_audio_files( ) ) + if x.shape[0] > 0: if remove_dc_offset: x -= np.mean(x) @@ -148,6 +154,8 @@ def process_audio_files( parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) + parser.add_argument( + "--output-sampling-rate", default=None, type=int, help=("resample output audio")) parser.add_argument( "--vad-fs", default=100, type=float, help=("vad sampling frequency") From beab75c01ca9dc44bc0143437d29f96f439f6b7e Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 1 Feb 2023 00:20:52 -0500 Subject: [PATCH 02/89] update slurm configuration for rockfish --- egs/commonvoice/v1/cmd.sh | 6 ++++-- egs/commonvoice/v1/conf/slurm.conf | 15 +++++++++++++++ .../v1/conf/wav2vec2xlsr300m_transducer_do.yaml | 3 ++- 
.../conf/wav2vec2xlsr300m_transducer_do0.2.yaml | 3 ++- .../conf/wav2vec2xlsr300m_transducer_do0.3.yaml | 3 ++- .../conf/wav2vec2xlsr300m_transducer_do0.4.yaml | 4 +++- egs/commonvoice/v1/datapath.sh | 11 ++++++----- egs/commonvoice/v1/default_config.sh | 2 +- egs/commonvoice/v1/run_001_prepare_data.sh | 3 +-- hyp_utils/conda_env.sh | 2 +- 10 files changed, 37 insertions(+), 15 deletions(-) create mode 100644 egs/commonvoice/v1/conf/slurm.conf diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh index 89dbb7d8..6606a180 100755 --- a/egs/commonvoice/v1/cmd.sh +++ b/egs/commonvoice/v1/cmd.sh @@ -18,11 +18,13 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +elif [ "$(hostname -d)" == "rockfish.cluster" ];then + export train_cmd="slurm.pl --config conf/slurm.conf --mem 4G" + export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" else export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" export cuda_eval_cmd="$train_cmd" fi - - diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf new file mode 100644 index 00000000..11bf450f --- /dev/null +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -0,0 +1,15 @@ +# Default configuration +command sbatch --export=PATH +option name=* --job-name $0 +default time=48:00:00 +option time=* --time $0 +option mem=* --mem-per-cpu $0 +option mem=0 +option num_threads=* --cpus-per-task $0 +option num_threads=4 --cpus-per-task 4 +option num_nodes=* --nodes $0 +default gpu=0 +option gpu=0 +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. 
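Stepping back to the --output-sampling-rate option added to hyperion/bin/preprocess_audio_files.py in the previous patch (and exposed as --osr 16000 by steps_transducer/preprocess_audios_for_nnet_train.sh): the new sample count is computed from the ratio of target to source rate, and fs is then overwritten. A standalone sketch of that step (illustrative; assumes numpy and scipy, with a made-up input signal):

    import numpy as np
    from scipy import signal

    fs = 48000                    # e.g. a 48 kHz source recording
    output_sampling_rate = 16000  # value passed via --osr in the recipe
    x = np.random.randn(2 * fs)   # stand-in for 2 s of decoded audio

    # FFT-based resampling to the target rate, as in process_audio_files()
    x = signal.resample(x, int(x.shape[0] * output_sampling_rate / fs))
    fs = output_sampling_rate     # x.shape[0] is now 32000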
diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml index c7fc2df7..19aaac2c 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -1,5 +1,6 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml index 1ee4ec72..baa6cde3 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -1,5 +1,6 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml index ca7c1995..3a5ff1f5 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -1,5 +1,6 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml index 9fed09e7..9c07f5e7 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -1,5 +1,7 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh index 4c7987ef..e844d6cd 100644 --- a/egs/commonvoice/v1/datapath.sh +++ b/egs/commonvoice/v1/datapath.sh @@ -5,13 +5,14 @@ if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - librispeech_root=/export/corpora5/LibriSpeech + commonvoice_root= musan_root=/export/corpora5/JHU/musan + echo "Put your database paths here" + exit 1 +elif [ "$(hostname --domain)" == "rockfish.cluster" ];then + commonvoice_root=/data/jvillal7/corpora/commonvoice + musan_root=/data/jvillal7/corpora/musan elif [ "$(hostname --domain)" == "cm.gemini" ];then - # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 - # voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 - # 
voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 - # musan_root=/expscratch/dgromero/corpora-open/musan echo "Put your database paths here" exit 1 else diff --git a/egs/commonvoice/v1/default_config.sh b/egs/commonvoice/v1/default_config.sh index 2b6239b6..6f5a2dfb 120000 --- a/egs/commonvoice/v1/default_config.sh +++ b/egs/commonvoice/v1/default_config.sh @@ -1 +1 @@ -global_conf/config_transducer_v1.sh \ No newline at end of file +global_conf/config_transducer_v3.3_it.sh \ No newline at end of file diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index d839fac6..6a5a6e10 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -19,7 +19,6 @@ nj=6 mkdir -p data -commonvoice=/export/c06/ylu125/GSP/corpora/CommonVoice if [ ${stage} -le 1 ]; then @@ -29,7 +28,7 @@ if [ ${stage} -le 1 ]; then for lan in it #sv-SE do # use underscore-separated names in data directories. - local/data_prep.sh ${lan} $commonvoice data/ + local/data_prep.sh ${lan} $commonvoice_root data/ done fi diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index ceee4e93..11b509bb 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -65,7 +65,7 @@ if [ $num_gpus -gt 0 ];then free_gpu=$(which hyp_utils/free-gpu) fi - if [ ! -z "$free_gpu" ];then + if [ ! -z "$free_gpu" ] && [ "$(hostname --domain)" != "rockfish.cluster" ];then # if free-gpu found set env var, otherwise we assume that you can use any gpu export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) fi From 046b5f7e88be73acc67ae8f58069397205d74d50 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 1 Feb 2023 13:32:17 -0500 Subject: [PATCH 03/89] update data preparation for different languge --- egs/commonvoice/v1/run_001_prepare_data.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index 6a5a6e10..4c0d0297 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -25,7 +25,7 @@ if [ ${stage} -le 1 ]; then ### Task dependent. You have to make data the following preparation part by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 1: Data preparation" - for lan in it #sv-SE + for lan in $language #it sv-SE do # use underscore-separated names in data directories. 
local/data_prep.sh ${lan} $commonvoice_root data/ @@ -35,7 +35,7 @@ fi if [ ${stage} -le 2 ]; then echo "stage 2: Data conversion" # for part in $test_data $dev_data $nnet_data - for lan in it #sv-SE + for lan in $language #it sv-SE do for part in ${lan}_test ${lan}_dev ${lan}_train do From beb2ed5405f71d5cfe29ed62c343a8f6f825196f Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 15 Feb 2023 17:11:43 -0500 Subject: [PATCH 04/89] update config and add cer scripts --- egs/commonvoice/v1/conf/slurm.conf | 4 +- ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 4 +- ...v2vec2xlsr300m_transducer_stage2_v3.3.yaml | 6 +-- .../config_transducer_v3.3_en_fr_it.sh | 41 +++++++++++++++++++ egs/commonvoice/v1/run_030_inference.sh | 6 ++- .../decode_wav2vec2transducer.sh | 9 +++- .../models/wav2transducer/beam_search.py | 5 ++- 7 files changed, 64 insertions(+), 11 deletions(-) create mode 100644 egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf index 11bf450f..262344ea 100644 --- a/egs/commonvoice/v1/conf/slurm.conf +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -6,10 +6,10 @@ option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 option num_threads=* --cpus-per-task $0 -option num_threads=4 --cpus-per-task 4 +option num_threads=1 --cpus-per-task 1 option num_nodes=* --nodes $0 default gpu=0 option gpu=0 -option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 1 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU # note: the --max-jobs-run option is supported as a special case # by slurm.pl and you don't have to handle it in the config file. 
diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml index 564ea8c7..e9fe0b05 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 4 + num_workers: 2 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 4 + num_workers: 2 model: wav2vec2xlsr300m_transducer_do0.4.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml index 8017f9b3..686f9133 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 4 + num_workers: 2 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 4 + num_workers: 2 model: transducer: decoder: @@ -56,6 +56,6 @@ trainer: epochs: 120 # eff_batch_size: 1024 eff_batch_size: 128 - train_mode: hf-feats-frozen-nograd + train_mode: full diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh new file mode 100644 index 00000000..fcb675b8 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh @@ -0,0 +1,41 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=en_fr_it_train_proc_audio +dev_data=en_fr_it_dev_proc_audio +test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" + +language=en_fr_it + +bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3_en_fr_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index 86dccf0a..cf2c8fb2 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -7,6 +7,8 @@ . 
./path.sh set -e +stage=0 + config_file=default_config.sh use_gpu=false nnet_stage=1 @@ -37,10 +39,10 @@ transducer_dir=exp/transducer/$nnet_name # Extracts x-vectors for evaluation -for name in $dev_data $test_data +for name in $test_data # $dev_data $test_data do nj=16 - steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj --stage $stage ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model done diff --git a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh index 143087a5..4a23d9fa 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh @@ -74,7 +74,14 @@ if [ $stage -le 0 ];then fi if [ $stage -le 1 ];then - echo "compute wer" + echo "compute wer, cer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text + fi diff --git a/hyperion/torch/models/wav2transducer/beam_search.py b/hyperion/torch/models/wav2transducer/beam_search.py index b23a0769..2550ab3c 100644 --- a/hyperion/torch/models/wav2transducer/beam_search.py +++ b/hyperion/torch/models/wav2transducer/beam_search.py @@ -227,6 +227,9 @@ def beam_search( B = B[:beam] break t += 1 - best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + try: + best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + except: + return "" ys = best_hyp.ys[1:] # [1:] to remove the blank return ys From ff0fd554f5a0d4908b473cbb59a4bb607c7a7aba Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 15 Feb 2023 17:16:51 -0500 Subject: [PATCH 05/89] temporal remove data preparation for duration --- egs/commonvoice/v1/local/data_prep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/commonvoice/v1/local/data_prep.sh b/egs/commonvoice/v1/local/data_prep.sh index d68c2368..f21fea8d 100755 --- a/egs/commonvoice/v1/local/data_prep.sh +++ b/egs/commonvoice/v1/local/data_prep.sh @@ -28,6 +28,6 @@ do lhotse kaldi export ${dst}/${language}/cv-${language}_recordings_${part}.jsonl.gz ${dst}/${language}/cv-${language}_supervisions_${part}.jsonl.gz ${dst}/${language}_${part} utils/utt2spk_to_spk2utt.pl ${dst}/${language}_${part}/utt2spk > ${dst}/${language}_${part}/spk2utt utils/fix_data_dir.sh ${dst}/${language}_${part} - steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${part//-/_} + # steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${part//-/_} done From f179db41f27cd393b58adc3352406fdb6cc09dcc Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 15 Feb 2023 17:40:10 -0500 Subject: [PATCH 06/89] Add combination for multiple languages --- .../config_transducer_v3.3_en_fr_it.sh | 1 + egs/commonvoice/v1/run_001_prepare_data.sh | 23 +++++++++++++++--- hyp_utils/steps_transducer/word2char.py | 24 +++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 hyp_utils/steps_transducer/word2char.py diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh 
b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh index fcb675b8..3c8efca9 100644 --- a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh @@ -11,6 +11,7 @@ nnet_data=en_fr_it_train_proc_audio dev_data=en_fr_it_dev_proc_audio test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" +lans="en fr it" language=en_fr_it bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index 4c0d0297..7d05ba2c 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -25,7 +25,7 @@ if [ ${stage} -le 1 ]; then ### Task dependent. You have to make data the following preparation part by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 1: Data preparation" - for lan in $language #it sv-SE + for lan in $lans do # use underscore-separated names in data directories. local/data_prep.sh ${lan} $commonvoice_root data/ @@ -35,15 +35,32 @@ fi if [ ${stage} -le 2 ]; then echo "stage 2: Data conversion" # for part in $test_data $dev_data $nnet_data - for lan in $language #it sv-SE + for lan in $lans do for part in ${lan}_test ${lan}_dev ${lan}_train do echo ${part} - steps_transducer/preprocess_audios_for_nnet_train.sh --nj 20 --cmd "$train_cmd" \ + steps_transducer/preprocess_audios_for_nnet_train.sh --nj 16 --cmd "$train_cmd" \ --storage_name commonvoice-v1-$(date +'%m_%d_%H_%M') --use-bin-vad false \ --osr 16000 data/${part} data/${part}_proc_audio exp/${part}_proc_audio utils/fix_data_dir.sh data/${part}_proc_audio || true done done fi + +if [ ${stage} -le 3 ]; then + echo "stage 3: Combine Multilingual Data" + + dev_folders="" + train_folders="" + for lan in $lans + do + dev_folders+="data/${lan}_dev_proc_audio " + train_folders+="data/${lan}_train_proc_audio " + done + + combine_data.sh data/dev_data/ $dev_folders + combine_data.sh data/nnet_data/ $train_folders + + +fi \ No newline at end of file diff --git a/hyp_utils/steps_transducer/word2char.py b/hyp_utils/steps_transducer/word2char.py new file mode 100644 index 00000000..062832c4 --- /dev/null +++ b/hyp_utils/steps_transducer/word2char.py @@ -0,0 +1,24 @@ +import os +import sys + +word_file = sys.argv[1] # "data/it_test_proc_audio/text" +char_file = sys.argv[2] # "data/it_test_proc_audio/text_char" + + +# word_file = "exp/transducer/wav2vec2xlsr300m_transducer_v3.3_it.s1/it_test_proc_audio/transducer.text" +# char_file = "exp/transducer/wav2vec2xlsr300m_transducer_v3.3_it.s1/it_test_proc_audio/transducer_char.text" + +output_chars = [] +with open(word_file, "r") as fi: + for line in fi.readlines(): + words = line.split(" ") + chars = [words[0]] + for wrd in words[1:]: + for c in wrd: + chars.append(c) + output_chars.append(chars) + +with open(char_file, "w") as fo: + for chars in output_chars: + fo.writelines(" ".join(chars)) + From f816ed366bf6b6bddf9342976bd714b17eb960f8 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 19 Feb 2023 22:10:28 -0500 Subject: [PATCH 07/89] Add language identification task for commonvoice --- ...c2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 4 +- ...v2vec2xlsr300m_transducer_stage2_v3.3.yaml | 4 +- .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 40 ++ .../global_conf/config_lid_v3.3_en_fr_it.sh | 42 ++ egs/commonvoice/v1/run_012_train_lid.sh | 136 ++++++ hyperion/bin/train_wav2vec2languageid.py 
| 261 ++++++++++++ hyperion/bin/train_wav2vec2transducer.py | 6 +- hyperion/torch/data/audio_dataset.py | 20 + hyperion/torch/models/__init__.py | 1 + .../torch/models/wav2languageid/__init__.py | 7 + .../wav2languageid/hf_wav2languageid.py | 391 ++++++++++++++++++ .../hf_wav2vec2resnet1d_languageid.py | 99 +++++ hyperion/torch/trainers/__init__.py | 2 + hyperion/torch/trainers/languageid_trainer.py | 208 ++++++++++ 15 files changed, 1268 insertions(+), 8 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh create mode 100755 egs/commonvoice/v1/run_012_train_lid.sh create mode 100755 hyperion/bin/train_wav2vec2languageid.py create mode 100644 hyperion/torch/models/wav2languageid/__init__.py create mode 100644 hyperion/torch/models/wav2languageid/hf_wav2languageid.py create mode 100644 hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py create mode 100644 hyperion/torch/trainers/languageid_trainer.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..afe885a3 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml index e9fe0b05..96e0c4aa 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 2 + num_workers: 1 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 2 + num_workers: 1 model: wav2vec2xlsr300m_transducer_do0.4.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml index 686f9133..88073958 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -13,7 +13,7 @@ data: 
min_batch_size: 1 drop_last: false data_loader: - num_workers: 2 + num_workers: 1 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 2 + num_workers: 1 model: transducer: decoder: diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml new file mode 100644 index 00000000..2e7574c2 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh b/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh new file mode 100644 index 00000000..08a9f950 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=en_fr_it_train_proc_audio +dev_data=en_fr_it_dev_proc_audio +test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" + +lans="en fr it" +language=en_fr_it + +bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v3.3_en_fr_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh new file mode 100755 index 00000000..80948243 --- /dev/null +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -0,0 +1,136 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=1 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.language-id-file $train_dir/utt2lang \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.language-id-file $val_dir/utt2lang \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.language-id-file $train_dir/utt2lang \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.language-id-file $val_dir/utt2lang \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file 
$train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.language-id-file $train_dir/utt2lang \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.language-id-file $val_dir/utt2lang \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py new file mode 100755 index 00000000..093042f6 --- /dev/null +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import LanguageIDTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, + # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, + # "hf_wavlm2resnet1d": HFWavLM2ResNet1dLanguageID, +} + + +def Language_collate(batch): + audio = [] + audio_length = [] + language = [] + for record in batch: + wav = torch.as_tensor(record[0]) + audio.append(wav) + audio_length.append(wav.shape[0]) + language.append(record[1]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + language = torch.as_tensor(language) + + return torch.transpose(audio, 0, 1), audio_length, language + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + 
batch_sampler=sampler, + **largs, + collate_fn=Language_collate) + return data_loader + + +def init_model(num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + model_args["languageid"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument("--data.train.dataset.language_id_file", type=str) + parser.add_argument("--data.val.dataset.language_id_file", type=str) + + + parser.add_argument( + "--data.train.dataset.class_files", + type=str, + ) + + + parser.add_argument( + "--data.dev.dataset.class_files", + type=str, + ) + + parser.add_argument( + "--data.train.dataset.class_names", + type=str, + ) + + parser.add_argument( + "--data.dev.dataset.class_names", + type=str, + ) + + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 
2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Language model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ee60080a..cb96c0f6 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -107,9 +107,6 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -215,7 +212,8 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - + print("cuda available:", torch.cuda.is_available()) + logging.info("cuda available: {}".format(torch.cuda.is_available())) for k, v in model_dict.items(): parser_k = make_parser(v) subcommands.add_subcommand(k, parser_k) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 721c7a1f..3bfa328b 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -467,6 +467,7 @@ def __init__( bpe_model=None, text_file=None, time_durs_file=None, + language_id_file=None, aug_cfgs=None, num_augs=1, return_segment_info=None, @@ -512,7 +513,15 @@ def __init__( else: assert "duration" in self.seg_set + if language_id_file is not None: + if rank == 0: + logging.info("loading language id file %s" % language_id_file) + + language_ids = SegmentSet.load(language_id_file) + self.seg_set["language"] = language_ids.loc[self.seg_set["id"]].class_id + logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) @@ -523,6 +532,8 @@ def __init__( if text_file is not None: logging.info("loading text files") self._load_text_infos(text_file, is_val) + + self.return_segment_info = ( [] if return_segment_info is None else return_segment_info ) @@ -764,6 +775,7 @@ def filter_args(**kwargs): "return_segment_info", "return_orig", "time_durs_file", + "language_id_file", "target_sample_freq", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -807,6 +819,14 @@ def add_class_args(parser, prefix=None, skip={}): ), ) + parser.add_argument( + "--language-id-file", + default=None, + help=( + "file with language ids for each utterance" + ), + ) + parser.add_argument( "--bpe-model", default=None, diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 44ff171d..21fe7e6f 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -20,6 +20,7 @@ from 
.wav2transducer import HFWav2Vec2Transducer +from .wav2languageid import HFWav2Vec2ResNet1dLanguageID from .vae.vae import VAE from .vae.vq_vae import VQVAE diff --git a/hyperion/torch/models/wav2languageid/__init__.py b/hyperion/torch/models/wav2languageid/__init__.py new file mode 100644 index 00000000..849a30a6 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2resnet1d_languageid import HFWav2Vec2ResNet1dLanguageID \ No newline at end of file diff --git a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py new file mode 100644 index 00000000..22974afe --- /dev/null +++ b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py @@ -0,0 +1,391 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import contextlib +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +# import torch.nn.functional as nnf + +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class HFWav2LanguageID(TorchModel): + """Abstract Base class for language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + languageid: language identification model object. + feat_fusion_start: the input to language identification model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): + + super().__init__() + self.hf_feats = hf_feats + self.languageid = languageid + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
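
[Editor's note] As a side note on the fusion logic above: the snippet below is a minimal, standalone sketch of the "weighted-avg" option (a learnable weight per hidden layer, softmax-normalized, applied to the stacked wav2vec hidden states). Tensor shapes and variable names are made up for the illustration and are not the hyperion API.

import torch
import torch.nn as nn

# Toy weighted-average fusion over 4 hidden layers of shape (batch, time, dim).
num_layers, batch, time, dim = 4, 2, 50, 1024
hid_feats = [torch.randn(batch, time, dim) for _ in range(num_layers)]
feat_fuser = nn.Parameter(torch.zeros(num_layers))       # zeros -> uniform average at init

stacked = torch.stack(hid_feats, dim=-1)                  # (batch, time, dim, num_layers)
norm_weights = nn.functional.softmax(feat_fuser, dim=-1)  # one weight per layer, sums to 1
fused = torch.sum(stacked * norm_weights, dim=-1)         # (batch, time, dim)
print(fused.shape)                                        # torch.Size([2, 50, 1024])
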
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start :] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + @property + def sample_frequency(self): + return self.hf_feats.sample_frequency + + def compute_prototype_affinity(self): + return self.languageid.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.languageid.update_loss_margin(epoch) + + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.languageid.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the language identification encoder. + hid_feats = [ + f.transpose(1, 2) + for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + y=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the language identification encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. 
If None, no encoder layers are returned. + return_enc_layers: list of integers indicating, which classification head layers + we should return. If None, no head layers are returned. + return_logits: if True, it adds the logits to the output dictionary. + Returns: + Tensor with class logits with shape=(batch, num_classes) or + Dictionary with "logits", "h_enc" (list of hidden encoder layers), + "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers + ) + output = self.languageid( + feats, + feat_lengths, + y, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + if not return_feat_layers: + return output + + if not isinstance(output, dict): + # if the languageid just returned the logits we put then into a dictionary + # to append the hid feats later. + output["logits"] = output + + output["h_feats"] = hid_feats + return output + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + hf_chunk_length=0, + xvec_chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + + feats, _, feat_lengths = self.forward_feats( + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.languageid.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_feat_fuser() + self.freeze_hf_feats() + self.languageid.freeze_preembed_layers() + elif mode in ["ft-languageid", "ft-languageid-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalanguageid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.hf_feats.train() + self.languageid._train("ft-embed_affine") + elif train_mode in [ + "ft-languageid", + "hf-feats-frozen", + "ft-languageid-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.languageid._train("full") + else: + raise ValueError(f"invalanguageid train_mode={train_mode}") + + @staticmethod + def valanguageid_train_modes(): + return [ + "full", + "frozen", + 
"ft-embed-affine", + "ft-languageid", + "hf-feats-frozen", + "ft-languageid-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valanguageid_args = ( + "hf_feats", + "languageid", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valanguageid_args if k in kwargs) + return args + + def get_config(self): + + hf_cfg = self.hf_feats.get_config() + xvec_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del xvec_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "languageid": xvec_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, languageid): + logging.info("changing hf wav2xvector config") + self.hf_feats.change_config(**hf_feats) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=( + "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + help="languageid options", + ) diff --git a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py new file mode 100644 index 00000000..d357cd87 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py @@ -0,0 +1,99 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional + +import torch +import torch.nn as nn + +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ...tpm import HFWav2Vec2 +from .hf_wav2languageid import HFWav2LanguageID + + +class HFWav2Vec2ResNet1dLanguageID(HFWav2LanguageID): + """Class extracting Wav2Vec2 + ResNet1d language identifications from waveform. + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + languageid: ResNet1dLanguageID configuration dictionary or object. + feat_fusion_start: the input to language identification model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + + base_args = HFWav2LanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + HFWav2LanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index 593cfa1f..5db38bf7 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -6,6 +6,8 @@ from .torch_trainer import TorchTrainer + +from .languageid_trainer import LanguageIDTrainer from .transducer_trainer import TransducerTrainer from .xvector_trainer import XVectorTrainer diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py new file mode 100644 index 00000000..3a65bfde --- /dev/null +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -0,0 +1,208 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import os +from collections import OrderedDict as ODict + +import logging + +import torch +import torchaudio +import torch.nn as nn + +from ..utils import MetricAcc +from .torch_trainer import TorchTrainer +from torch.distributed.elastic.multiprocessing.errors import record + + +class LanguageIDTrainer(TorchTrainer): + """Trainer to train Language identification style models. + + Attributes: + model: Language identification model object. + optim: pytorch optimizer object or options dict + epochs: max. 
number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + if loss is None: + loss = nn.CrossEntropyLoss() + super().__init__( + model, + loss, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
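
[Editor's note] Before the epoch loop that follows, a quick illustration of the gradient-accumulation pattern it relies on: the loss is scaled by 1/grad_acc_steps and the optimizer steps only once every grad_acc_steps batches, emulating a larger effective batch size. This is a self-contained toy sketch with a made-up linear model, not the trainer itself.

import torch
import torch.nn as nn

model = nn.Linear(10, 3)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.CrossEntropyLoss()
grad_acc_steps = 4

for batch in range(8):
    if batch % grad_acc_steps == 0:
        optimizer.zero_grad()
    x, y = torch.randn(2, 10), torch.randint(0, 3, (2,))
    loss = loss_fn(model(x), y) / grad_acc_steps  # scale so accumulated gradients average out
    loss.backward()
    if (batch + 1) % grad_acc_steps == 0:
        optimizer.step()  # one update per 4 batches -> effective batch size of 8
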
+ """ + + self.model.update_loss_margin(self.cur_epoch) + + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + + for batch, (data, audio_length, target) in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + data, audio_length, target = data.to(self.device), audio_length.to( + self.device), target.to(self.device) + batch_size = data.shape[0] + + with self.amp_autocast(): + # TODO: Check and Modify output, loss from the model + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # loss = loss.mean() / self.grad_acc_steps + output = self.model(data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs + + # def validation_epoch(self, data_loader, swa_update_bn=False): + # """Validation epoch loop + + # Args: + # data_loader: PyTorch data loader return input/output pairs. + # sw_update_bn: wheter or not, update batch-norm layers in SWA. + # """ + + # metric_acc = MetricAcc(self.device) + # batch_metrics = ODict() + # with torch.no_grad(): + # if swa_update_bn: + # log_tag = "train_" + # self.train() + # else: + # log_tag = "val_" + # self.model.eval() + + # for batch, (data, audio_length, target) in enumerate(data_loader): + # data, audio_length, target = data.to( + # self.device), audio_length.to(self.device), target.to( + # self.device) + # batch_size = data.shape[0] + # # data, target = data.to(self.device), target.to(self.device) + # # batch_size = data.shape[0] + + # with self.amp_autocast(): + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # # output = self.model(data) + # # loss = self.loss(output, target) + + # batch_metrics["loss"] = loss.mean().item() + # for k, metric in self.metrics.items(): + # batch_metrics[k] = metric(output, target) + + # metric_acc.update(batch_metrics, batch_size) + + # logs = metric_acc.metrics + # logs = ODict((log_tag + k, v) for k, v in logs.items()) + # return logs From b524b8491b7f4dac8c1b9a04a5db486d97c414d1 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Thu, 23 Mar 2023 20:36:56 -0400 Subject: [PATCH 08/89] Add Class Weighted Sampler for ASR and utterance-wise LID --- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 67 +++ .../v1/global_conf/config_lid_v2.0_13langs.sh | 44 ++ .../class_weighted_bucketing_seg_sampler.py | 251 +++++++++++ .../torch/data/class_weighted_seg_sampler.py | 392 ++++++++++++++++++ hyperion/torch/data/seg_sampler_factory.py | 13 +- hyperion/torch/trainers/languageid_trainer.py | 87 ++-- 6 files changed, 809 insertions(+), 45 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh create mode 100644 
hyperion/torch/data/class_weighted_bucketing_seg_sampler.py create mode 100644 hyperion/torch/data/class_weighted_seg_sampler.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..c06e46e8 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.5 + + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.5 + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh new file mode 100644 index 00000000..851cbc18 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" 
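
[Editor's note] The weight_mode: "data-prior" and weight_exponent options in the sampler config above control how strongly the language distribution is flattened. The numbers below are invented and only illustrate how the exponent reshapes duration-proportional sampling probabilities.

import numpy as np

total_dur = np.array([1000.0, 100.0, 10.0])  # hypothetical hours of speech per language
for exponent in (1.0, 0.5, 0.0):
    w = total_dur ** exponent
    print(exponent, np.round(w / w.sum(), 3))
# 1.0 -> [0.901 0.09  0.009]   (pure data prior)
# 0.5 -> [0.706 0.223 0.071]   (partially flattened)
# 0.0 -> [0.333 0.333 0.333]   (uniform over languages)
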
+nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py new file mode 100644 index 00000000..94943ccc --- /dev/null +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -0,0 +1,251 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser +import logging + +import numpy as np +import pandas as pd +import torch +import torch.distributed as dist + +from .hyp_sampler import HypSampler +from .class_weighted_seg_sampler import ClassWeightedRandomSegSampler + + +class ClassWeightedRandomBucketingSegSampler(HypSampler): + def __init__(self, + seg_set, + class_info, + base_sampler=ClassWeightedRandomSegSampler, + num_buckets=10, + length_column="duration", + weight_exponent=1.0, + weight_mode="custom", + seg_weight_mode="uniform", + class_name="language", + seed=1234, + **base_kwargs): + super().__init__(shuffle=False, seed=seed) + self.class_name = class_name + self.seg_set = seg_set + self.class_info = class_info + self.base_sampler = base_sampler + self.base_kwargs = base_kwargs + self.base_kwargs["seed"] = seed + self.num_buckets = num_buckets + self.length_column = length_column + self.weight_exponent = weight_exponent + self.weight_mode = weight_mode + self.seg_weight_mode = seg_weight_mode + self._gather_class_info() + self._set_class_weights() + self._create_bucket_samplers() + self._compute_len() + self.depleted_buckets = torch.zeros((num_buckets, ), dtype=torch.bool) + + def create_buckets(self): + # class_ids = self._sample_classes() + sort_idx = np.argsort(self.seg_set[self.length_column].values) + sorted_seg_set = self.seg_set.iloc[sort_idx] + cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, + axis=0) + bucket_length = cum_lengths[-1] / self.num_buckets + buckets = [] + for i in range(self.num_buckets): + # logging.info("self.seg_set", self.seg_set.get_col_idx(self.length_column)) + # logging.info("sorted_seg_set", sorted_seg_set.get_col_idx(self.length_column)) + bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) + bucket_i = sorted_seg_set.loc[bucket_idx] + # logging.info("bucket_i", bucket_i.get_col_idx(self.length_column)) + buckets.append(bucket_i) + cum_lengths -= bucket_length + + return buckets + + def _create_bucket_samplers(self): + buckets = self.create_buckets() + bucket_samplers = [] + for i in range(self.num_buckets): + sampler_i = self.base_sampler(buckets[i], + self.class_info, + # weight_exponent=self.weight_exponent, + # weight_mode=self.weight_mode, + seg_weight_mode=self.seg_weight_mode, + class_name=self.class_name, + **self.base_kwargs) + bucket_samplers.append(sampler_i) + + self.bucket_samplers = bucket_samplers + + def __len__(self): + return self._len + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. 
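
[Editor's note] The create_buckets method above groups segments so that each bucket holds roughly the same total amount of audio; batches are then drawn within a bucket, which keeps utterance lengths similar and reduces padding. A toy, self-contained sketch of that sort-and-split idea (illustrative data only, not the sampler class):

import numpy as np
import pandas as pd

seg_set = pd.DataFrame({"id": [f"u{i}" for i in range(8)],
                        "duration": [1., 9., 3., 7., 2., 8., 4., 6.]})
num_buckets = 2
sorted_segs = seg_set.iloc[np.argsort(seg_set["duration"].values)]
cum = np.cumsum(sorted_segs["duration"].values)
bucket_len = cum[-1] / num_buckets
for i in range(num_buckets):
    mask = (cum <= bucket_len) & (cum > 0)   # segments whose cumulative duration fits this bucket
    print(f"bucket {i}:", sorted_segs.loc[mask, "id"].tolist())
    cum -= bucket_len
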
+ total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_column] + total_dur[i] = durs_i.sum() + else: + total_dur[i] = 0 + + self.class_info["total_duration"] = total_dur + # logging.info("total_duration", self.class_info["total_duration"]) + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + def _set_class_weights(self): + # logging.info("setting class weights") + # logging.info(f'weight mode:{self.weight_mode}') + # logging.info(f'weight exponent:{self.weight_exponent}') + # import pdb; pdb.set_trace() + if self.weight_mode == "uniform": + self.class_info.set_uniform_weights() + elif self.weight_mode == "data-prior": + weights = self.class_info["total_duration"].values + self.class_info.set_weights(weights) + logging.info(f'data-prior weight:{self.class_info["weights"]}') + + if self.weight_exponent != 1.0: + self.class_info.exp_weights(self.weight_exponent) + logging.info(f'weight_exponent weight:{self.class_info["weights"]}') + + + def _compute_len(self): + self._len = 0 + for i in range(self.num_buckets): + self._len += len(self.bucket_samplers[i]) + + def set_epoch(self, epoch): + for i in range(self.num_buckets): + self.bucket_samplers[i].set_epoch(epoch) + + def __iter__(self): + super().__iter__() + self.depleted_buckets[:] = False + for i in range(self.num_buckets): + self.bucket_samplers[i].__iter__() + + return self + + def all_buckets_depleted(self): + return torch.all(self.depleted_buckets).item() + + def __next__(self): + + if self.batch == self._len or self.all_buckets_depleted(): + raise StopIteration + + while True: + bucket_idx = torch.randint(low=0, + high=self.num_buckets, + size=(1, ), + generator=self.rng).item() + if self.depleted_buckets[bucket_idx]: + continue + + bucket = self.bucket_samplers[bucket_idx] + try: + batch = next(bucket) + break + except StopIteration: + self.depleted_buckets[bucket_idx] = True + if self.all_buckets_depleted(): + raise StopIteration() + + if self.batch == 0: + logging.info("batch 0 chunks=%s", str(batch[:10])) + + self.batch += 1 + return batch + + @property + def avg_batch_size(self): + avg_batch_size = 0 + for sampler in self.bucket_samplers: + avg_batch_size += sampler.avg_batch_size + + avg_batch_size /= self.num_buckets + return avg_batch_size + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "num_buckets", + "length_column", + "weight_exponent", + "weight_mode", + "seg_weight_mode", + "class_name", + "length_column", + "shuffle", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the segments or chunks at the beginning of the epoch", + ) + + 
parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-column", + default="duration", + help="which column in the segment table indicates the duration of the segment", + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py new file mode 100644 index 00000000..09a34591 --- /dev/null +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -0,0 +1,392 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging + +import numpy as np + +import torch +from .hyp_sampler import HypSampler + + +def get_loc(seg_set, keys): + if isinstance(keys, (list, np.ndarray)): + return seg_set.index.get_indexer(keys) + + loc = seg_set.index.get_loc(keys) + if isinstance(loc, int): + return loc + elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + return np.nonzero(loc)[0] + else: + return list(range(loc.start, loc.stop, loc.step)) + +class ClassWeightedRandomSegSampler(HypSampler): + def __init__( + self, + seg_set, + class_info, + min_batch_size=1, + max_batch_size=None, + max_batch_length=None, + length_name="duration", + shuffle=False, + drop_last=False, + # weight_exponent=1.0, + # weight_mode="custom", + seg_weight_mode="uniform", + num_segs_per_class=1, + class_name="class_id", + seed=1234, + ): + super().__init__(shuffle=shuffle, seed=seed) + self.class_info = class_info + # self.weight_exponent=weight_exponent + # self.weight_mode=weight_mode + self.seg_weight_mode = seg_weight_mode + self.num_segs_per_class = num_segs_per_class + self.class_name=class_name + self.seg_set = seg_set + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.max_batch_length = max_batch_length + self.var_batch_size = max_batch_length is not None + self.length_name = length_name + if self.var_batch_size: + avg_batch_size = max_batch_length / np.mean( + self.seg_set[self.length_name]) + else: + avg_batch_size = min_batch_size + + self.avg_batch_size = avg_batch_size + + if drop_last: + self._len = int( + len(self.seg_set) / (avg_batch_size * self.world_size)) + else: + self._len = int( + math.ceil( + (len(self.seg_set) // self.world_size) / avg_batch_size)) + + self._gather_class_info() + self._permutation = None + + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. 
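
[Editor's note] For orientation, the class-weighted draw implemented in _sample_classes further below boils down to a torch.multinomial over the renormalized per-class weights, followed by picking segments from each chosen class. A standalone toy sketch (invented weights and labels):

import torch

rng = torch.Generator().manual_seed(1234)
class_ids = ["en", "fr", "it"]
weights = torch.tensor([0.6, 0.3, 0.1])          # already renormalized class weights
idx = torch.multinomial(weights, num_samples=8, replacement=True, generator=rng)
print([class_ids[i] for i in idx])               # languages drawn for one batch
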
+ max_dur = np.zeros(len(self.class_info)) + min_dur = np.zeros(len(self.class_info)) + total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_name] + max_dur[i] = durs_i.max() + min_dur[i] = durs_i.min() + total_dur[i] = durs_i.sum() + else: + max_dur[i] = min_dur[i] = total_dur[i] = 0 + + self.class_info["max_seg_duration"] = max_dur + self.class_info["min_seg_duration"] = min_dur + self.class_info["total_duration"] = total_dur + # logging.info("total_duration", self.class_info["total_duration"]) + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + # we need the list of segments from each class + # to speed up segment sampling + # searching then in each batch, it is too slow + map_class_to_segs = self.seg_set[["id", self.class_name]].set_index( + self.class_name + ) + self.map_class_to_segs_idx = {} + for class_id in self.class_info["id"].values: + if class_id in map_class_to_segs.index: + seg_ids = map_class_to_segs.loc[class_id, "id"] + if isinstance(seg_ids, str): + seg_ids = [seg_ids] + else: + seg_ids = seg_ids.values + + seg_idx = get_loc(self.seg_set,seg_ids) + else: + seg_idx = [] + self.class_info.loc[class_id, "weights"] = 0.0 + self.class_info.renorm_weights() + + self.map_class_to_segs_idx[class_id] = seg_idx + logging.info(f'weight_exponent weight:{self.class_info["weights"]}') + + + def _get_class_weights(self): + # if not self.var_weights: + # return torch.as_tensor(self.class_info["weights"].values) + + class_weights = self.class_info["weights"].values.copy() + # renormalize weights + class_weights /= class_weights.sum() + return torch.as_tensor(class_weights) + + def _sample_classes(self, num_classes): + weights = self._get_class_weights() + # logging.info("weights: %s", weights) + + row_idx = torch.multinomial( + weights, num_samples=num_classes, replacement=True, generator=self.rng, + ).numpy() + + class_ids = self.class_info.iloc[row_idx].id.values + + return class_ids + + + def _sample_segs(self, class_ids): + + dur_col_idx = self.seg_set.columns.get_loc(self.length_name) + id_col_idx = self.seg_set.columns.get_loc("id") + + seg_ids = [] + for c in class_ids: + # for each class we sample segments longer than chunk length + # get segments belonging to c + # t1 = time.time() + seg_idx_c = self.map_class_to_segs_idx[c] + # seg_idx_c = self.map_class_to_segs_idx[c] + # t2 = time.time() + durs = self.seg_set.iloc[seg_idx_c, dur_col_idx].values + # if self.class_info.loc[c, "min_seg_duration"] < chunk_length: + # mask = durs >= chunk_length + # seg_idx_c = seg_idx_c[mask] + # durs = durs[mask] + + # t3 = time.time() + # sample num_segs_per_class random segments + if len(seg_idx_c) == 0: + logging.error("no segments found with class=%s dur=%d", c, chunk_length) + if self.seg_weight_mode == "uniform": + sel_idx = torch.randint( + low=0, + high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() + + elif self.seg_weight_mode == "data-prior": + weights = durs / durs.sum() + sel_idx = torch.multinomial( + torch.from_numpy(weights), + num_samples=self.num_segs_per_class, + replacement=True, + generator=self.rng, + ).numpy() + # t4 = time.time() + else: + raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) + + sel_seg_idx_c = seg_idx_c[sel_idx] + 
sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) + # t5 = time.time() + seg_ids.extend(sel_seg_ids_c) + # t6 = time.time() + # logging.info( + # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + # ) + + return seg_ids + + def __len__(self): + return self._len + + def _shuffle_segs(self): + self._permutation = torch.randperm(len(self.seg_set), + generator=self.rng).numpy() + + def __iter__(self): + super().__iter__() + if self.shuffle: + self._shuffle_segs() + + self.start = self.rank + return self + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + + if self.var_batch_size: + column_idx = self.seg_set.columns.get_loc(self.length_name) + idxs = [] + max_length = 0 + batch_size = 0 + while True: + if self.shuffle: + idx = self._permutation[self.start] + else: + idx = self.start + + max_length = max(max_length, self.seg_set.iloc[idx, + column_idx]) + if max_length * (batch_size + 1) > self.max_batch_length: + break + + idxs.append(idx) + self.start = (self.start + self.world_size) % len(self.seg_set) + batch_size += 1 + if (self.max_batch_size is not None + and batch_size >= self.max_batch_size): + break + + assert len( + idxs + ) >= 1, f"increase max_batch_length {self.max_batch_length} >= {max_length}" + else: + stop = min(self.start + self.world_size * self.min_batch_size, + len(self.seg_set)) + if self.shuffle: + idxs = self._permutation[self.start:stop:self.world_size] + else: + idxs = slice(self.start, stop, self.world_size) + + self.start += self.world_size * self.min_batch_size + + + class_ids = self._sample_classes(batch_size) + seg_ids = self._sample_segs(class_ids) + + + # if "chunk_start" in self.seg_set: + # chunks = self.seg_set.iloc[idxs] + # seg_ids = [(id, s, d) for id, s, d in zip( + # chunks.seg_id, chunks.chunk_start, chunks[self.length_name])] + # else: + # seg_ids = self.seg_set.iloc[idxs].id.values + + if self.batch == 0: + logging.info("batch 0 seg_ids=%s", str(seg_ids[:10])) + + self.batch += 1 + return seg_ids + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "min_batch_size", + "max_batch_size", + "max_batch_length", + "length_name", + # "weight_exponent", + # "weight_mode", + "seg_weight_mode", + "num_segs_per_class", + "class_name", + "shuffle", + "drop_last", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--min-batch-size", + type=int, + default=1, + help=("minimum batch size per gpu"), + ) + parser.add_argument( + "--max-batch-size", + type=int, + default=None, + help= + ("maximum batch size per gpu, if None, estimated from max_batch_length" + ), + ) + + parser.add_argument( + "--max-batch-duration", + type=float, + default=None, + help= + ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), + ) + + parser.add_argument( + "--drop-last", + action=ActionYesNo, + help="drops the last batch of the epoch", + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help= + "shuffles the segments or chunks at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-name", + default="duration", + help= + "which column in the segment table indicates the duration of the file", 
+ ) + + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--num-segs-per-class", + type=int, + default=1, + help=("number of segments per class in batch"), + ) + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 512f2f64..63b0cc86 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -13,13 +13,18 @@ from .class_weighted_seg_chunk_sampler import ClassWeightedRandomSegChunkSampler from .seg_chunk_sampler import SegChunkSampler from .bucketing_seg_sampler import BucketingSegSampler +from .class_weighted_bucketing_seg_sampler import ClassWeightedRandomBucketingSegSampler +from .class_weighted_seg_sampler import ClassWeightedRandomSegSampler + sampler_dict = { "class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler, "seg_sampler": SegSampler, + "class_weighted_seg_sampler": ClassWeightedRandomSegSampler, "seg_chunk_sampler": SegChunkSampler, "bucketing_seg_sampler": BucketingSegSampler, + "class_weighted_random_bucketing_seg_sampler": ClassWeightedRandomBucketingSegSampler, } @@ -45,7 +50,7 @@ def create( sampler_class = sampler_dict[sampler_type] sampler_kwargs = sampler_class.filter_args(**kwargs) - if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler"]: + if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler", "class_weighted_random_bucketing_seg_sampler"]: base_sampler_class = sampler_dict[base_sampler_type] base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) sampler_kwargs.update(base_sampler_kwargs) @@ -55,7 +60,9 @@ def create( base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) sampler_kwargs.update(base_sampler_kwargs) - if sampler_type in ["class_weighted_random_seg_chunk_sampler"]: + if sampler_type in ["class_weighted_random_seg_chunk_sampler", "class_weighted_random_bucketing_seg_sampler"]: + # import pdb; pdb.set_trace() + logging.info(f"sampler-args={sampler_kwargs}") try: class_name = sampler_kwargs["class_name"] except: @@ -110,7 +117,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--base-sampler-type", - choices=["seg_sampler", "bucketing_seg_sampler"], + choices=["seg_sampler", "bucketing_seg_sampler", "bucketing_seg_sampler","class_weighted_seg_sampler"], default="seg_sampler", help= "base sampler used for seg_chunk_sampler or bucketing_seg_sampler", diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py index 3a65bfde..773402d5 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -164,45 +164,48 @@ def train_epoch(self, data_loader): logs["lr"] = self._get_lr() return logs - # def validation_epoch(self, data_loader, swa_update_bn=False): - # """Validation epoch loop - - # Args: - # data_loader: PyTorch data loader 
return input/output pairs. - # sw_update_bn: wheter or not, update batch-norm layers in SWA. - # """ - - # metric_acc = MetricAcc(self.device) - # batch_metrics = ODict() - # with torch.no_grad(): - # if swa_update_bn: - # log_tag = "train_" - # self.train() - # else: - # log_tag = "val_" - # self.model.eval() - - # for batch, (data, audio_length, target) in enumerate(data_loader): - # data, audio_length, target = data.to( - # self.device), audio_length.to(self.device), target.to( - # self.device) - # batch_size = data.shape[0] - # # data, target = data.to(self.device), target.to(self.device) - # # batch_size = data.shape[0] - - # with self.amp_autocast(): - # output, loss = self.model(data, - # x_lengths=audio_length, - # y=target) - # # output = self.model(data) - # # loss = self.loss(output, target) - - # batch_metrics["loss"] = loss.mean().item() - # for k, metric in self.metrics.items(): - # batch_metrics[k] = metric(output, target) - - # metric_acc.update(batch_metrics, batch_size) - - # logs = metric_acc.metrics - # logs = ODict((log_tag + k, v) for k, v in logs.items()) - # return logs + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. + """ + + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, (data, audio_length, target) in enumerate(data_loader): + data, audio_length, target = data.to( + self.device), audio_length.to(self.device), target.to( + self.device) + batch_size = data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # output = self.model(data) + # loss = self.loss(output, target) + + batch_metrics["loss"] = loss.mean().item() + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs From 07ddda643d93699054961b3d9f351ca745f9757e Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 25 Mar 2023 19:15:51 -0400 Subject: [PATCH 09/89] Remove the seg_weighted_mode for sequence-level task --- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml | 67 +++++++++++++++++++ .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 4 +- .../v1/global_conf/config_lid_v2.1_13langs.sh | 44 ++++++++++++ egs/commonvoice/v1/run_001_prepare_data.sh | 10 +-- egs/commonvoice/v1/run_012_train_lid.sh | 22 ++---- hyperion/bin/train_wav2vec2languageid.py | 15 +---- hyperion/torch/data/audio_dataset.py | 15 ----- .../class_weighted_bucketing_seg_sampler.py | 13 ---- .../torch/data/class_weighted_seg_sampler.py | 60 +++-------------- 9 files changed, 139 insertions(+), 111 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml new file mode 100644 index 
00000000..06d5697d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml index 2e7574c2..5ca98bd9 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -32,9 +32,9 @@ languageid: inner_feats: 128 embed_dim: 192 cos_scale: 32.0 - margin: 0.2 + margin: 0.0 margin_warmup_epochs: 5 - intertop_margin: 0.1 + intertop_margin: 0.0 dropout_rate: 0.0 feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh new file mode 100644 index 00000000..c5febd98 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.1_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.1.yaml +nnet_s2_args="" 
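+# Illustrative note on how this config is consumed (assuming the recipe's usual
+# flow): run_012_train_lid.sh sources it, trains stage 1 into $nnet_s1_dir, and
+# at stage 2 initializes finetune_wav2vec2languageid.py from the stage-1
+# checkpoint via --in-model-file $nnet_s1, writing stage-2 models to $nnet_s2_dir.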
+nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index 7d05ba2c..d4873f0f 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -37,7 +37,7 @@ if [ ${stage} -le 2 ]; then # for part in $test_data $dev_data $nnet_data for lan in $lans do - for part in ${lan}_test ${lan}_dev ${lan}_train + for part in ${lan}_train # ${lan}_test ${lan}_dev do echo ${part} steps_transducer/preprocess_audios_for_nnet_train.sh --nj 16 --cmd "$train_cmd" \ @@ -59,8 +59,10 @@ if [ ${stage} -le 3 ]; then train_folders+="data/${lan}_train_proc_audio " done - combine_data.sh data/dev_data/ $dev_folders - combine_data.sh data/nnet_data/ $train_folders - + combine_data.sh data/${dev_data}/ $dev_folders + combine_data.sh data/${nnet_data}/ $train_folders + awk 'BEGIN {FS = " "} NR == FNR {a[$1]=$2; next} {print $1 "," a[$1] "," $2}' data/13_langs_dev_proc_audio/utt2lang data/13_langs_dev_proc_audio/utt2spk > data/13_langs_dev_proc_audio/data/13_langs_train_proc_audio/utt2seg.csv + awk 'BEGIN {FS = " "} NR == FNR {a[$1]=$2; next} {print $1 "," a[$1] "," $2}' data/13_langs_train_proc_audio/utt2lang data/13_langs_train_proc_audio/utt2spk > data/13_langs_train_proc_audio/data/13_langs_train_proc_audio/utt2seg.csv + # cut -d' ' -f1 --complement data/${nnet_data}/text > data/lm/${lan}_transcript_words.txt fi \ No newline at end of file diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh index 80948243..3b250e16 100755 --- a/egs/commonvoice/v1/run_012_train_lid.sh +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -8,7 +8,7 @@ set -e stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -49,17 +49,14 @@ if [ $stage -le 1 ]; then train_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ - --data.train.dataset.language-id-file $train_dir/utt2lang \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ - --data.val.dataset.class-files $val_dir/langs \ - --data.val.dataset.language-id-file $val_dir/utt2lang \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ @@ -81,17 +78,14 @@ if [ $stage -le 2 ]; then finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ 
- --data.train.dataset.language-id-file $train_dir/utt2lang \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ - --data.val.dataset.class-files $val_dir/langs \ - --data.val.dataset.language-id-file $val_dir/utt2lang \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ --in-model-file $nnet_s1 \ @@ -115,17 +109,15 @@ if [ $stage -le 3 ]; then finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ - --data.train.dataset.language-id-file $train_dir/utt2lang \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $val_dir/langs \ - --data.val.dataset.language-id-file $val_dir/utt2lang \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py index 093042f6..de5b2f2d 100755 --- a/hyperion/bin/train_wav2vec2languageid.py +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -175,21 +175,10 @@ def make_parser(model_class): ) parser.add_argument("--data.val.dataset.text_file", type=str) - parser.add_argument("--data.train.dataset.language_id_file", type=str) - parser.add_argument("--data.val.dataset.language_id_file", type=str) - - parser.add_argument( - "--data.train.dataset.class_files", - type=str, + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" ) - - - parser.add_argument( - "--data.dev.dataset.class_files", - type=str, - ) - parser.add_argument( "--data.train.dataset.class_names", type=str, diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 3bfa328b..230b7220 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -467,7 +467,6 @@ def __init__( bpe_model=None, text_file=None, time_durs_file=None, - language_id_file=None, aug_cfgs=None, num_augs=1, return_segment_info=None, @@ -513,12 +512,6 @@ def __init__( else: assert "duration" in self.seg_set - if language_id_file is not None: - if rank == 0: - logging.info("loading language id file %s" % language_id_file) - - language_ids = SegmentSet.load(language_id_file) - self.seg_set["language"] = language_ids.loc[self.seg_set["id"]].class_id logging.info("loading class-info files") @@ -775,7 +768,6 @@ def filter_args(**kwargs): "return_segment_info", "return_orig", "time_durs_file", - "language_id_file", "target_sample_freq", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -819,13 +811,6 @@ def add_class_args(parser, prefix=None, skip={}): ), ) - parser.add_argument( - "--language-id-file", - default=None, - help=( - "file with language ids for each utterance" - ), - ) parser.add_argument( 
"--bpe-model", diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py index 94943ccc..749d0558 100644 --- a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -25,7 +25,6 @@ def __init__(self, length_column="duration", weight_exponent=1.0, weight_mode="custom", - seg_weight_mode="uniform", class_name="language", seed=1234, **base_kwargs): @@ -40,7 +39,6 @@ def __init__(self, self.length_column = length_column self.weight_exponent = weight_exponent self.weight_mode = weight_mode - self.seg_weight_mode = seg_weight_mode self._gather_class_info() self._set_class_weights() self._create_bucket_samplers() @@ -72,9 +70,6 @@ def _create_bucket_samplers(self): for i in range(self.num_buckets): sampler_i = self.base_sampler(buckets[i], self.class_info, - # weight_exponent=self.weight_exponent, - # weight_mode=self.weight_mode, - seg_weight_mode=self.seg_weight_mode, class_name=self.class_name, **self.base_kwargs) bucket_samplers.append(sampler_i) @@ -186,7 +181,6 @@ def filter_args(**kwargs): "length_column", "weight_exponent", "weight_mode", - "seg_weight_mode", "class_name", "length_column", "shuffle", @@ -216,13 +210,6 @@ def add_class_args(parser, prefix=None): help=("method to get the class weights"), ) - parser.add_argument( - "--seg-weight-mode", - default="uniform", - choices=["uniform", "data-prior"], - help=("method to sample segments given a class"), - ) - parser.add_argument( "--shuffle", action=ActionYesNo, diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py index 09a34591..c56a96a7 100644 --- a/hyperion/torch/data/class_weighted_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -6,6 +6,8 @@ import math from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging +import copy + import numpy as np @@ -36,18 +38,12 @@ def __init__( length_name="duration", shuffle=False, drop_last=False, - # weight_exponent=1.0, - # weight_mode="custom", - seg_weight_mode="uniform", num_segs_per_class=1, class_name="class_id", seed=1234, ): super().__init__(shuffle=shuffle, seed=seed) - self.class_info = class_info - # self.weight_exponent=weight_exponent - # self.weight_mode=weight_mode - self.seg_weight_mode = seg_weight_mode + self.class_info = copy.deepcopy(class_info) self.num_segs_per_class = num_segs_per_class self.class_name=class_name self.seg_set = seg_set @@ -120,6 +116,7 @@ def _gather_class_info(self): seg_idx = get_loc(self.seg_set,seg_ids) else: seg_idx = [] + logging.warning("no segments found with class=%s", class_id) self.class_info.loc[class_id, "weights"] = 0.0 self.class_info.renorm_weights() @@ -172,25 +169,13 @@ def _sample_segs(self, class_ids): # sample num_segs_per_class random segments if len(seg_idx_c) == 0: logging.error("no segments found with class=%s dur=%d", c, chunk_length) - if self.seg_weight_mode == "uniform": - sel_idx = torch.randint( - low=0, - high=len(seg_idx_c), - size=(self.num_segs_per_class,), - generator=self.rng, - ).numpy() - - elif self.seg_weight_mode == "data-prior": - weights = durs / durs.sum() - sel_idx = torch.multinomial( - torch.from_numpy(weights), - num_samples=self.num_segs_per_class, - replacement=True, - generator=self.rng, - ).numpy() - # t4 = time.time() - else: - raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) + + sel_idx = torch.randint( + low=0, + 
high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() sel_seg_idx_c = seg_idx_c[sel_idx] sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) @@ -285,9 +270,6 @@ def filter_args(**kwargs): "max_batch_size", "max_batch_length", "length_name", - # "weight_exponent", - # "weight_mode", - "seg_weight_mode", "num_segs_per_class", "class_name", "shuffle", @@ -354,32 +336,12 @@ def add_class_args(parser, prefix=None): "which column in the segment table indicates the duration of the file", ) - - parser.add_argument( - "--weight-exponent", - default=1.0, - type=float, - help=("exponent for class weights"), - ) - parser.add_argument( - "--weight-mode", - default="custom", - choices=["custom", "uniform", "data-prior"], - help=("method to get the class weights"), - ) - parser.add_argument( "--num-segs-per-class", type=int, default=1, help=("number of segments per class in batch"), ) - parser.add_argument( - "--seg-weight-mode", - default="uniform", - choices=["uniform", "data-prior"], - help=("method to sample segments given a class"), - ) parser.add_argument( "--class-name", default="class_id", From 396e020276cb55c864b2845836e5713df6daf84b Mon Sep 17 00:00:00 2001 From: ylu125 Date: Mon, 27 Mar 2023 00:00:03 -0400 Subject: [PATCH 10/89] Update the LID trainer for merging the new dataloader --- egs/commonvoice/v1/run_011_train_asr.sh | 32 +++++-- hyperion/bin/train_wav2vec2languageid.py | 21 ++++- hyperion/torch/trainers/__init__.py | 3 - hyperion/torch/trainers/languageid_trainer.py | 91 ++++++++++--------- 4 files changed, 86 insertions(+), 61 deletions(-) diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index 1b402133..e79de7af 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -7,8 +7,20 @@ . 
./path.sh set -e +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -47,14 +59,18 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ - train_wav2vec2transducer.py $nnet_type \ + train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ @@ -76,11 +92,11 @@ if [ $stage -le 2 ]; then finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ --in-model-file $nnet_s1 \ @@ -104,11 +120,11 @@ if [ $stage -le 3 ]; then finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py index de5b2f2d..7af47d03 100755 --- a/hyperion/bin/train_wav2vec2languageid.py +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -42,16 +42,27 @@ def Language_collate(batch): audio_length = [] language = [] for record in batch: - wav = torch.as_tensor(record[0]) + wav = torch.as_tensor(record["x"]) audio.append(wav) audio_length.append(wav.shape[0]) - language.append(record[1]) - audio = pad_sequence(audio) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, 
descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + + language = [language[k] for k in sort_idx] language = torch.as_tensor(language) - - return torch.transpose(audio, 0, 1), audio_length, language + batch = { + "x": audio, + "x_lengths": audio_length, + "language": language, + } + return batch def init_data(partition, rank, num_gpus, **kwargs): data_kwargs = kwargs["data"][partition] diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index e1f6824f..212f0e92 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -5,13 +5,10 @@ from .dvae_trainer import DVAETrainer from .torch_trainer import TorchTrainer -<<<<<<< HEAD from .languageid_trainer import LanguageIDTrainer -======= ->>>>>>> hyp/persephone-asr from .transducer_trainer import TransducerTrainer from .vae_trainer import VAETrainer from .vq_dvae_trainer import VQDVAETrainer diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py index 773402d5..0770cb8f 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -2,18 +2,19 @@ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch -import torchaudio import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record -from ..utils import MetricAcc +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer -from torch.distributed.elastic.multiprocessing.errors import record class LanguageIDTrainer(TorchTrainer): @@ -75,38 +76,14 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="language", ): if loss is None: loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) @record def train_epoch(self, data_loader): @@ -115,6 +92,9 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. 
""" + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key + ] self.model.update_loss_margin(self.cur_epoch) @@ -122,14 +102,14 @@ def train_epoch(self, data_loader): batch_metrics = ODict() self.model.train() - for batch, (data, audio_length, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, audio_length, target = data.to(self.device), audio_length.to( - self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] with self.amp_autocast(): # TODO: Check and Modify output, loss from the model @@ -137,7 +117,7 @@ def train_epoch(self, data_loader): # x_lengths=audio_length, # y=target) # loss = loss.mean() / self.grad_acc_steps - output = self.model(data, y=target) + output = self.model(input_data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -171,7 +151,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): data_loader: PyTorch data loader return input/output pairs. sw_update_bn: wheter or not, update batch-norm layers in SWA. """ - + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key + ] metric_acc = MetricAcc(self.device) batch_metrics = ODict() with torch.no_grad(): @@ -182,16 +164,15 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, audio_length, target) in enumerate(data_loader): - data, audio_length, target = data.to( - self.device), audio_length.to(self.device), target.to( - self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] with self.amp_autocast(): - output = self.model(data, y=target) + output = self.model(input_data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps # output, loss = self.model(data, @@ -209,3 +190,23 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-key", + default="language", + help="dict. 
key for nnet targets") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 2ecdebfedacf1d55750c2d4c99c178b5ecdfe727 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Mon, 27 Mar 2023 02:28:50 -0400 Subject: [PATCH 11/89] add commonvoice config for rnnt transducer --- ...v2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml | 83 +++++++++++++++++++ .../config_pruned_transducer_v1.3_13langs.sh | 44 ++++++++++ 2 files changed, 127 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml new file mode 100644 index 00000000..3712babc --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 70. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh new file mode 100644 index 00000000..575a8436 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio 
kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3_13_langs_16000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0019.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From d33abe910668f6b710eeab55233c1acadd182ae4 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Fri, 21 Apr 2023 23:34:57 -0400 Subject: [PATCH 12/89] Add fine-tuning code for pruned RNN-T, LID, and Both --- ...v2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml | 73 +++ ...v2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml | 71 +++ ...v2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml | 73 +++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml | 6 +- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml | 69 +++ ...c2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml | 71 +++ ...wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml | 41 ++ .../v1/global_conf/config_lid_v2.2_13langs.sh | 44 ++ .../config_pruned_transducer_v3.0_13langs.sh | 44 ++ .../config_pruned_transducer_v3.1_13langs.sh | 44 ++ .../config_pruned_transducer_v3.2_13langs.sh | 44 ++ .../config_pruned_transducer_v4.0_13langs.sh | 44 ++ hyperion/bin/finetune_wav2vec2languageid.py | 267 +++++++++++ .../bin/finetune_wav2vec2rnn_transducer.py | 248 ++++++++++ .../finetune_wav2vec2transducer_languageid.py | 255 +++++++++++ hyperion/np/augment/noise_augment.py | 2 +- .../class_weighted_bucketing_seg_sampler.py | 27 ++ .../torch/data/class_weighted_seg_sampler.py | 14 +- hyperion/torch/data/seg_sampler_factory.py | 10 + hyperion/torch/models/__init__.py | 1 + .../torch/models/transducer/rnn_transducer.py | 4 +- .../wav2transducer_languageid/__init__.py | 7 + .../hf_wav2rnn_transducer_languageid.py | 428 ++++++++++++++++++ .../hf_wav2vec2rnn_transducer_languageid.py | 119 +++++ hyperion/torch/trainers/__init__.py | 1 + .../trainers/transducer_languageid_trainer.py | 222 +++++++++ 30 files changed, 2566 insertions(+), 7 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml create mode 100644 
egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh create mode 100755 hyperion/bin/finetune_wav2vec2languageid.py create mode 100755 hyperion/bin/finetune_wav2vec2rnn_transducer.py create mode 100755 hyperion/bin/finetune_wav2vec2transducer_languageid.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/__init__.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py create mode 100644 hyperion/torch/trainers/transducer_languageid_trainer.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml new file mode 100644 index 00000000..4718389d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 
1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml new file mode 100644 index 00000000..f41f8dad --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml new file mode 100644 index 00000000..fbadc196 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + 
class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 10 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml new file mode 100644 index 00000000..f41f8dad --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml new file mode 100644 index 00000000..9db63d77 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + 
sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml new file mode 100644 index 00000000..85970fa6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml new file mode 100644 index 00000000..9db63d77 --- /dev/null +++ 
b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml index 06d5697d..0bb34b23 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml @@ -17,10 +17,11 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 1 + num_workers: 4 val: dataset: aug_cfgs: @@ -39,8 +40,9 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 1 + num_workers: 4 model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml new file mode 100644 index 00000000..77cd2d26 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. 
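+      # Illustrative reading of the class-weighting options below (an assumption
+      # based on the class_weighted_seg_sampler convention, not spelled out here):
+      # with weight_mode "data-prior" and weight_exponent 0.3, each language is
+      # drawn with probability roughly proportional to prior**0.3 after
+      # renormalization, so low-resource languages are over-sampled relative to
+      # their raw share of the training data.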
+ min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 420000 + hold_steps: 300000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml new file mode 100644 index 00000000..c73c7130 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + max_audio_length: 20. 
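+      # Note (assumption about the max_audio_length option this stage-2 config
+      # introduces): it appears to cap individual segments at ~20 s so that full
+      # fine-tuning of the wav2vec2 encoder (train_mode: full) fits in GPU memory;
+      # the frozen-encoder stage-1 configs do not set it.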
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + languageid: + cos_scale: 32.0 +trainer: + optim: + opt_type: sgd + lr: 0.0005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 420000 + hold_steps: 300000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml new file mode 100644 index 00000000..7d6d9473 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.0 + margin_warmup_epochs: 5 + intertop_margin: 0.0 + dropout_rate: 0.2 + +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh new file mode 100644 index 00000000..debd9377 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.2_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v2.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git 
a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh new file mode 100644 index 00000000..0f66c12a --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.0_13_langs_4000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0019.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh new file mode 100644 index 00000000..3fb2f93a --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.1_13_langs_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0010.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + 
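+# stage-3 (s3) settings: nnet_s3 is assigned twice below, so the second assignment (model_ep0005.pth) is the checkpoint actually used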
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh new file mode 100644 index 00000000..4a990e2c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.2_13_langs_16000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0001.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh new file mode 100644 index 00000000..29a762fa --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/finetune_wav2vec2languageid.py b/hyperion/bin/finetune_wav2vec2languageid.py new file mode 100755 index 00000000..4ac24e98 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2languageid.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import LanguageIDTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, + # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, + # "hf_wavlm2resnet1d": HFWavLM2ResNet1dLanguageID, +} + + +def Language_collate(batch): + audio = [] + audio_length = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + batch = { + "x": audio, + "x_lengths": audio_length, + "language": language, + } + return batch + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + 
collate_fn=Language_collate) + return data_loader + + +def init_model(num_classes, in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + + model_args["languageid"]["num_classes"] = num_classes + model = TML.load(in_model_file) + logging.info(model_args) + model.change_config(**model_args) + + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + + + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.add_argument( + "--data.train.dataset.class_names", + type=str, + ) + + parser.add_argument( + "--data.dev.dataset.class_names", + type=str, + ) + + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = 
ArgumentParser( + description="Train Wav2Vec2Language model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2rnn_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..4092ecd7 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_transducer.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer) +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + } + return batch + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + 
logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + 
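+    # distributed data parallel (DDP) options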
ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2transducer_languageid.py b/hyperion/bin/finetune_wav2vec2transducer_languageid.py new file mode 100755 index 00000000..0628f3da --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2transducer_languageid.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, + +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + text = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + text.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + text = [text[k] for k in sort_idx] + text = k2.RaggedTensor(text) + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": text, + "languageid": language, + } + return batch + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler 
args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_language_collate) + return data_loader + + +def init_model(num_classes, in_model_transducer, in_model_lid, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model_wav2transducer = TML.load(in_model_transducer) + model_wav2lid = TML.load(in_model_lid) + model_args["languageid"]["num_classes"] = num_classes + logging.info(model_args) + model = model_class(model_wav2transducer.hf_feats, model_wav2transducer.transducer, model_wav2lid.languageid) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", 
type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-transducer", required=True) + parser.add_argument("--in-model-lid", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 799db930..fe54f385 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10(((x+1e-5) ** 2).sum()) @staticmethod def snr(x, n): diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py index 749d0558..1509d446 100644 --- a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -23,9 +23,11 @@ def __init__(self, base_sampler=ClassWeightedRandomSegSampler, num_buckets=10, length_column="duration", + num_chunks_per_seg_epoch=1.0, weight_exponent=1.0, weight_mode="custom", class_name="language", + max_audio_length=None, seed=1234, **base_kwargs): super().__init__(shuffle=False, seed=seed) @@ -37,7 +39,9 @@ def __init__(self, self.base_kwargs["seed"] = seed self.num_buckets = num_buckets self.length_column = length_column + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch self.weight_exponent = weight_exponent + self.max_audio_length = max_audio_length self.weight_mode = weight_mode self._gather_class_info() self._set_class_weights() @@ -49,6 +53,10 @@ def create_buckets(self): # class_ids = self._sample_classes() sort_idx = np.argsort(self.seg_set[self.length_column].values) sorted_seg_set = self.seg_set.iloc[sort_idx] + # import pdb; pdb.set_trace() + # remove audio length larger than max_audio_length + if self.max_audio_length is not None: + sorted_seg_set = sorted_seg_set.loc[sorted_seg_set[self.length_column] <= self.max_audio_length] cum_lengths = 
np.cumsum(sorted_seg_set[self.length_column].values, axis=0) bucket_length = cum_lengths[-1] / self.num_buckets @@ -71,6 +79,7 @@ def _create_bucket_samplers(self): sampler_i = self.base_sampler(buckets[i], self.class_info, class_name=self.class_name, + num_chunks_per_seg_epoch=self.num_chunks_per_seg_epoch, **self.base_kwargs) bucket_samplers.append(sampler_i) @@ -179,8 +188,10 @@ def filter_args(**kwargs): valid_args = ( "num_buckets", "length_column", + "num_chunks_per_seg_epoch", "weight_exponent", "weight_mode", + "max_audio_length", "class_name", "length_column", "shuffle", @@ -197,12 +208,28 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") + parser.add_argument( + "--num-chunks-per-seg-epoch", + default=1, + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + parser.add_argument( "--weight-exponent", default=1.0, type=float, help=("exponent for class weights"), ) + + + parser.add_argument( + "--max-audio-length", + default=None, + type=float, + help=("the maximum length of an audio segment in seconds"), + ) + parser.add_argument( "--weight-mode", default="custom", diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py index c56a96a7..5af8cdcc 100644 --- a/hyperion/torch/data/class_weighted_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -35,6 +35,7 @@ def __init__( min_batch_size=1, max_batch_size=None, max_batch_length=None, + num_chunks_per_seg_epoch=1, length_name="duration", shuffle=False, drop_last=False, @@ -46,6 +47,7 @@ def __init__( self.class_info = copy.deepcopy(class_info) self.num_segs_per_class = num_segs_per_class self.class_name=class_name + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch self.seg_set = seg_set self.min_batch_size = min_batch_size self.max_batch_size = max_batch_size @@ -62,11 +64,11 @@ def __init__( if drop_last: self._len = int( - len(self.seg_set) / (avg_batch_size * self.world_size)) + self.num_chunks_per_seg_epoch * len(self.seg_set) / (avg_batch_size * self.world_size)) else: self._len = int( math.ceil( - (len(self.seg_set) // self.world_size) / avg_batch_size)) + (self.num_chunks_per_seg_epoch * len(self.seg_set) // self.world_size) / avg_batch_size)) self._gather_class_info() self._permutation = None @@ -271,6 +273,7 @@ def filter_args(**kwargs): "max_batch_length", "length_name", "num_segs_per_class", + "num_chunks_per_seg_epoch", "class_name", "shuffle", "drop_last", @@ -336,6 +339,13 @@ def add_class_args(parser, prefix=None): "which column in the segment table indicates the duration of the file", ) + parser.add_argument( + "--num-chunks-per-seg-epoch", + default=1, + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + parser.add_argument( "--num-segs-per-class", type=int, diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index f2fb1914..0a9a8a69 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -86,6 +86,7 @@ def filter_args(**kwargs): "min_batch_size", "max_batch_size", "max_batch_length", + "max_audio_length", "num_chunks_per_seg_epoch", "num_segs_per_class", "num_chunks_per_seg", @@ -154,6 +155,15 @@ def add_class_args(parser, prefix=None): ), ) + + parser.add_argument( + "--max-audio-length", + default=None, + type=float, + help=("the maximum length of an audio segment in seconds"), 
+ ) + + parser.add_argument( "--batch-size", default=None, diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 591bbb97..a8bb24d5 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -12,6 +12,7 @@ HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) +from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py index 0b886fdf..5b8bc3ec 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -61,7 +61,7 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, - y: k2.RaggedTensor, + y: Union[Dict, k2.RaggedTensor], ) -> RNNTransducerOutput: """ Args: @@ -199,7 +199,7 @@ def change_config( @staticmethod def filter_finetune_args(**kwargs): args = {} - decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) + decoder_args = RNNTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) args["decoder"] = decoder_args return args diff --git a/hyperion/torch/models/wav2transducer_languageid/__init__.py b/hyperion/torch/models/wav2transducer_languageid/__init__.py new file mode 100644 index 00000000..98ebfdc7 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D \ No newline at end of file diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py new file mode 100644 index 00000000..b710655e --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -0,0 +1,428 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ....utils import HypDataClass +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNTransducerOutput + +@dataclass +class RNNTransducerLanguageIDOutput(HypDataClass): + loss: torch.Tensor + loss_transducer: torch.Tensor + loss_lid: torch.Tensor + loss_transducer_simple: Optional[torch.Tensor] = None + loss_transducer_pruned: Optional[torch.Tensor] = None + h_feats: Optional[List[torch.Tensor]] = None + +class HFWav2RNNTransducerLanguageID(TorchModel): + """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + languageid: language identification model object. 
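+        loss_weight_transducer: weight of the transducer loss in the total loss.
+        loss_weight_lid: weight of the language identification loss in the total loss.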
+ feat_fusion_start: the input to the combined model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0,): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + self.transducer = transducer + self.languageid = languageid + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start:] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + text=None, + languageid=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers) + + + + logits = self.languageid( + feats, + None, + languageid, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + loss_lid = nn.CrossEntropyLoss()(logits, languageid) + + + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + trans_output = self.transducer( + feats, + feat_lengths, + text, + ) + + + + if return_feat_layers: + trans_output.h_feats = hid_feats + output = RNNTransducerLanguageIDOutput( self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. + """ + + feats, _, feat_lengths = self.forward_feats(x, x_lengths) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer(feats, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + return y + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + 
"ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + "loss_weight_transducer", + "loss_weight_lid", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + "loss_weight_transducer": self.loss_weight_transducer, + "loss_weight_lid": self.loss_weight_lid, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer, languageid): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py new file mode 100644 index 00000000..4fa19144 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -0,0 +1,119 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ..wav2languageid import HFWav2Vec2ResNet1dLanguageID +from ..wav2transducer import HFWav2Vec2RNNTransducer + + +from .hf_wav2rnn_transducer_languageid import HFWav2RNNTransducerLanguageID + + +class 
HFWav2Vec2RNNTransducerResnet1D(HFWav2RNNTransducerLanguageID): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNTransducer], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + # if isinstance(hf_feats, dict): + # if "class_name" in hf_feats: + # del hf_feats["class_name"] + # hf_feats = HFWav2Vec2(**hf_feats) + # else: + # assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(languageid, dict): + # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in languageid: + # del languageid["class_name"] + # languageid = ResNet1dLanguageID(**languageid) + # else: + # assert isinstance(languageid, ResNet1dLanguageID) + # assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + # hf_feats = wav2transducer.hf_feats + # transducer = wav2transducer.transducer + # languageid = wav2languageid.languageid + + + super().__init__(hf_feats, transducer, languageid, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducerLanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_args(**kwargs["transducer"]) + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["transducer"] = child_args + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNTransducer.add_class_args(parser, prefix="transducer") + # HFWav2RNNTransducer.add_class_args(parser) + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + # HFWav2LanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNTransducer.add_finetune_args(parser, prefix="transducer") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index 212f0e92..3c96c84f 100644 --- 
a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -10,6 +10,7 @@ from .languageid_trainer import LanguageIDTrainer from .transducer_trainer import TransducerTrainer +from .transducer_languageid_trainer import TransducerLanguageIDTrainer from .vae_trainer import VAETrainer from .vq_dvae_trainer import VQDVAETrainer from .vq_vae_trainer import VQVAETrainer diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py new file mode 100644 index 00000000..238e8022 --- /dev/null +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -0,0 +1,222 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from collections import OrderedDict as ODict + +import torch +import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record + +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import TorchTrainer + + +class TransducerLanguageIDTrainer(TorchTrainer): + """Trainer to train ASR style models. + + Attributes: + model: ASR model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + input_key="x", + target_key=["text", "languageid"], + ): + + loss = None + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
+ """ + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key[0], self.target_key[1] + ] + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + self.sp = data_loader.dataset.sp + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + + # # TODO: Check and Modify data, target + # data, audio_length, target = data.to(self.device), audio_length.to( + # self.device), target.to(self.device) + #print(data.keys(), batch_keys, flush=True) + input_data, input_lengths, text, languageid = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, + x_lengths=input_lengths, + text=text, + languageid=languageid) + loss = output.loss + loss = loss.mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. + """ + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key[0], self.target_key[1] + ] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + + input_data, input_lengths, text, languageid = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + # data, audio_length, target = data.to( + # self.device), audio_length.to(self.device), target.to( + # self.device) + # batch_size = data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, + x_lengths=input_lengths, + text=text, + languageid=languageid) + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-keys", + default=["text", "languageid"], + help="list of dict. 
key for nnet targets") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 3b7e8aca6e3c9c7eb22224bfa3374957ebac52ec Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sat, 22 Apr 2023 00:44:20 -0400 Subject: [PATCH 13/89] Add LID decode scripts --- egs/commonvoice/v1/run_030_inference.sh | 16 +- egs/commonvoice/v1/run_032_identificate.sh | 47 ++++ .../identificate_wav2vec2resnet1d.sh | 87 +++++++ hyperion/bin/identificate_wav2languageid.py | 238 ++++++++++++++++++ .../torch/narchs/rnn_transducer_decoder.py | 7 +- 5 files changed, 385 insertions(+), 10 deletions(-) create mode 100755 egs/commonvoice/v1/run_032_identificate.sh create mode 100755 egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh create mode 100755 hyperion/bin/identificate_wav2languageid.py diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index cf2c8fb2..ec5b140b 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -7,8 +7,6 @@ . ./path.sh set -e -stage=0 - config_file=default_config.sh use_gpu=false nnet_stage=1 @@ -36,14 +34,16 @@ fi transducer_dir=exp/transducer/$nnet_name +# test_data=test_clean # Extracts x-vectors for evaluation -for name in $test_data # $dev_data $test_data - do - nj=16 - steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj --stage $stage ${transducer_args} \ +for name in $test_data +do + nj=40 + steps_transducer/decode_wav2vec2rnn_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model - done -exit +done + diff --git a/egs/commonvoice/v1/run_032_identificate.sh b/egs/commonvoice/v1/run_032_identificate.sh new file mode 100755 index 00000000..a9a8cee5 --- /dev/null +++ b/egs/commonvoice/v1/run_032_identificate.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=0 +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + lid_args="--use-gpu true" + lid_cmd="$cuda_eval_cmd --mem 6G" +else + lid_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +lid_dir=exp/resnet1d/$nnet_name + +# Extracts x-vectors for evaluation +for name in $test_data # $dev_data $test_data + do + nj=40 + steps_lid/identificate_wav2vec2resnet1d.sh \ + --cmd "$lid_cmd" --nj $nj ${lid_args} \ + $nnet data/$name \ + $lid_dir/$name data/$nnet_data/langs + done + +exit diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh new file mode 100755 index 00000000..8b31ac2f --- /dev/null +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." + echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +lang_file=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/identificate_wav2languageid.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + identificate_wav2languageid.py \ + --part-idx JOB --num-parts $nj ${args} \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --lang-file $lang_file \ + --output $output_dir/languageid.JOB + set -e +fi + +if [ $stage -le 1 ];then + echo "compute error rate" + + cat $output_dir/languageid.* > $output_dir/langs + + # python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text + +fi diff --git a/hyperion/bin/identificate_wav2languageid.py b/hyperion/bin/identificate_wav2languageid.py new file mode 100755 index 00000000..8b01ac25 --- /dev/null +++ b/hyperion/bin/identificate_wav2languageid.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Dict, List, Tuple + +import sentencepiece as spm +import torch.nn as nn + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch import TorchModelLoader as TML + +from 
hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("lid-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_one_batch( + model: nn.Module, + lang_dict: Dict[int, str], + x: torch.Tensor) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. + """ + device = model.device + feature = x #batch["inputs"] + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + # feature_lens = torch.Tensor([x.shape[1]]).int() + + # encoder_out, hid_feats, encoder_out_lens = model.forward_feats( + # x=feature, x_lengths=feature_lens) + + predictions = [] + batch_size = feature.size(0) + + # encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # # fmt: off + # encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + output = model(feature) + _, pred = torch.max(output, dim=-1) + # to integer + pred = pred.cpu().numpy().tolist()[0] + predictions.append(lang_dict[pred]) + + logging.info("hyps:{}".format(" ".join(predictions))) + + return predictions + + +def decode_languageid(input_spec, output_spec, scp_sep, model_path, lang_file, + use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + # load language dict form langfile by row number + lang_dict = {} + with open(lang_file, "r") as f: + for i, line in enumerate(f): + lang_dict[i] = line.strip() + + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s" % (output_spec)) + # with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with open(output_spec, "w") as writer: + logging.info("opening input stream: {} with args={}".format( + input_spec, ar_args)) + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s" % (key0)) + for aug_id in range(num_augs): + t3 = time.time() + key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], + dtype=torch.get_default_dtype()).to(device) + + t5 = time.time() + tot_frames = x.shape[1] + + # logging.info( + # "utt %s detected %d/%d (%.2f %%) speech frames" % ( 
+ # key, + # x.shape[1], + # tot_frames, + # x.shape[1] / tot_frames * 100, + # )) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim, ), + dtype=float_cpu()) + else: + y = decode_one_batch(model=model, lang_dict=lang_dict, x=x) + + t7 = time.time() + + # writer.write([key], [y]) + writer.write(key + ' ' + ' '.join(y)+ "\n") + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ("utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f") % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + )) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("Extracts x-vectors from waveform computing " + "acoustic features on the fly")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + + parser.add_argument("--lang-file", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_languageid(**namespace_to_dict(args)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 265f2c9b..bf9189ee 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -480,8 +480,11 @@ def decode_time_sync_beam_search(self, break t += 1 - best_hyp = max(B, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + try: + best_hyp = max(B, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + except: + return "" ys = best_hyp.ys[1:] # [1:] to remove the blank return ys From 35391de52990806d4802a7e034abe0dc84d675ff Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 4 May 2023 09:55:06 -0400 Subject: [PATCH 14/89] new vox2 dataprep --- hyperion/data_prep/data_prep.py | 8 +- hyperion/data_prep/voxceleb2.py | 16 +- .../torch/narchs/rnn_transducer_decoder.py | 407 +++++++++--------- 3 files changed, 224 insertions(+), 207 deletions(-) diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index d9f6b238..fb6fc6c5 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -2,6 +2,7 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -66,21 +67,22 @@ def _get_recording_duration(scp, i, n): def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] + logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i in range(self.num_threads): + for i in tqdm(range(self.num_threads)): future = pool.submit( 
DataPrep._get_recording_duration, scp, i, self.num_threads ) futures.append(future) + logging.info("waiting threats...") res = [f.result() for f in tqdm(futures)] fss = list(itertools.chain(*[r[0] for r in res])) - durations = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) recording_set["duration"] = durations recording_set["sample_freq"] = fss diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index d8b9dd99..a1a9f0c3 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -158,8 +158,9 @@ def prepare(self): file_paths = [] futures = [] logging.info("making video cat lists") + logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i, rec_id in enumerate(rec_ids): + for i, rec_id in tqdm(enumerate(rec_ids)): future = pool.submit( VoxCeleb2DataPrep.make_cat_list, lists_cat_dir, @@ -170,6 +171,7 @@ def prepare(self): ) futures.append(future) + logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] video_ids = uniq_video_ids @@ -213,14 +215,14 @@ def prepare(self): df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A" for r in rec_ids ], - # "duration": recs.loc[rec_ids, "duration"], + "duration": recs.loc[rec_ids, "duration"].values, } ) - print( - recs.loc[rec_ids, "duration"], - len(segments), - len(recs.loc[rec_ids, "duration"]), - ) + # print( + # recs.loc[rec_ids, "duration"], + # len(segments), + # len(recs.loc[rec_ids, "duration"]), + # ) segments = SegmentSet(segments) segments.sort() diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 8db6c23a..763ec67c 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -16,7 +16,7 @@ try: import k2 except ModuleNotFoundError: - from ...utils import dummy_k2 as k2 + from ..utils import dummy_k2 as k2 from ...utils.misc import filter_func_args from ...utils.text import add_sos @@ -99,10 +99,8 @@ def __init__( if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) - self.simple_lm_proj = nn.Linear(self.predictor.out_feats, - vocab_size) - self.register_buffer("cur_step", torch.as_tensor(0, - dtype=torch.int)) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, dtype=torch.int)) def _make_predictor(self): pred_type = self.predictor_args["pred_type"] @@ -110,12 +108,10 @@ def _make_predictor(self): self.predictor_args["vocab_size"] = self.vocab_size self.predictor_args["blank_id"] = self.blank_id if pred_type == "rnn": - pred_args = filter_func_args(RNNPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) self.predictor = RNNPredictor(**pred_args) elif pred_type == "conv": - pred_args = filter_func_args(ConvPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(ConvPredictor.__init__, self.predictor_args) self.predictor = ConvPredictor(**pred_args) self.predictor_args["out_feats"] = self.predictor.embed_dim else: @@ -127,8 +123,7 @@ def _make_joiner(self): if joiner_type == "basic": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] - self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size) + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, self.vocab_size) else: raise 
ValueError(f"Unknown joiner type {joiner_type}") @@ -152,9 +147,14 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_torchaudio( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): logits = self.joiner(x, pred_out) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS @@ -170,14 +170,17 @@ def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -195,15 +198,18 @@ def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2_pruned( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -266,7 +272,7 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) pruned_loss_scale = 0.1 + 0.9 * r self.cur_step += 1 - #print(simple_loss_scale, pruned_loss_scale) + # print(simple_loss_scale, pruned_loss_scale) loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned @@ -288,44 +294,48 @@ def forward( loss_simple = loss_pruned = None if self.rnnt_loss == "k2_pruned": loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( - x, x_lengths, y, y_lengths, pred_out) + x, x_lengths, y, y_lengths, pred_out + ) elif self.rnnt_loss == "k2": loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out) elif self.rnnt_loss == "torchaudio": loss_simple = loss_pruned = None - loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, - pred_out) + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, pred_out) return loss, loss_simple, loss_pruned - def decode(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - method="time_sync_beam_search", - beam_width: int = 5, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + def decode( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: if method == "time_sync_beam_search": - return self.decode_time_sync_beam_search(x, - x_lengths, - beam_width=beam_width) + return 
self.decode_time_sync_beam_search( + x, x_lengths, beam_width=beam_width + ) elif method == "align_length_sync_beam_search": return self.decode_align_length_sync_beam_search( + x, x_lengths, beam_width=beam_width, max_sym_per_utt=max_sym_per_utt + ) + elif method == "greedy": + return self.decode_greedy( x, x_lengths, - beam_width=beam_width, - max_sym_per_utt=max_sym_per_utt) - elif method == "greedy": - return self.decode_greedy(x, - x_lengths, - max_sym_per_frame=max_sym_per_frame, - max_sym_per_utt=max_sym_per_utt) - - def decode_greedy(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) + + def decode_greedy( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: """ Args: x: encoder embeddings with shape = (N, T, C) @@ -339,8 +349,7 @@ def decode_greedy(self, blank_id = self.blank_id device = x.device - sos = torch.tensor([blank_id], device=device, - dtype=torch.int64).reshape(1, 1) + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) t = 0 @@ -350,7 +359,7 @@ def decode_greedy(self, sym_per_utt = 0 while t < T and sym_per_utt < max_sym_per_utt: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] logits = self.joiner(x_t, pred_out) # (1, 1, 1, vocab_size) # logits is @@ -371,10 +380,9 @@ def decode_greedy(self, return hyp - def decode_time_sync_beam_search(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - beam_width: int = 5) -> List[int]: + def decode_time_sync_beam_search( + self, x: torch.Tensor, x_lengths: torch.Tensor = None, beam_width: int = 5 + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -389,11 +397,10 @@ def decode_time_sync_beam_search(self, max_u = 20000 # terminate after this number of steps u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} while t < T and u < max_u: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] A = B B = [] @@ -406,13 +413,9 @@ def decode_time_sync_beam_search(self, cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -443,7 +446,7 @@ def decode_time_sync_beam_search(self, topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -452,9 +455,7 @@ def decode_time_sync_beam_search(self, new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) @@ -462,12 +463,9 @@ def decode_time_sync_beam_search(self, # check whether B contains more than "beam" elements more probable # than the most probable in A A_most_probable = max(A, key=lambda hyp: 
hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B = sorted( - [ - hyp - for hyp in B if hyp.log_prob > A_most_probable.log_prob - ], + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], key=lambda hyp: hyp.log_prob, reverse=True, ) @@ -483,17 +481,17 @@ def decode_time_sync_beam_search(self, break t += 1 - best_hyp = max(B, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(B, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys def decode_align_length_sync_beam_search( - self, - x: torch.Tensor, - x_lengths: torch.Tensor, - beam_width: int = 5, - max_sym_per_utt: int = 1000) -> List[int]: + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000, + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -503,39 +501,34 @@ def decode_align_length_sync_beam_search( sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) - #t = 0 + # t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] - #max_u = 20000 # terminate after this number of steps - #u = 0 + # max_u = 20000 # terminate after this number of steps + # u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} F = [] - #for t < T and u < max_u: + # for t < T and u < max_u: for i in range(T + max_sym_per_utt): A = [] for y_star in B: - #while u < max_u: + # while u < max_u: u = len(y_star.ys) - 1 t = i - u if t >= T: continue - #y_star = max(A, key=lambda hyp: hyp.log_prob) - #A.remove(y_star) - x_t = x[:, t:t + 1, :] + # y_star = max(A, key=lambda hyp: hyp.log_prob) + # A.remove(y_star) + x_t = x[:, t : t + 1, :] # Note: y_star.ys is unhashable, i.e., cannot be used # as a key into a dict cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -563,7 +556,7 @@ def decode_align_length_sync_beam_search( topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -572,20 +565,16 @@ def decode_align_length_sync_beam_search( new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) # check whether B contains more than "beam_width" elements more probable # than the most probable in A - #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B0 = sorted( - [hyp for hyp in A], - key=lambda hyp: hyp.log_prob, - reverse=True, + [hyp for hyp in A], key=lambda hyp: hyp.log_prob, reverse=True, ) B = [] 
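            # B holds the hypotheses kept for the next step; candidates come from B0,
            # which ranks all expansions in A by log-probability before pruning to the beam width.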
B_ys = set() @@ -605,8 +594,7 @@ def decode_align_length_sync_beam_search( B = B[:beam_width] break - best_hyp = max(F, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(F, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys @@ -617,8 +605,9 @@ def change_config( rnn_dropout_rate: float = 0.0, ): logging.info("changing decoder config") - self.predictor.change_config(override_dropouts, embed_dropout_rate, - rnn_dropout_rate) + self.predictor.change_config( + override_dropouts, embed_dropout_rate, rnn_dropout_rate + ) @staticmethod def filter_args(**kwargs): @@ -638,49 +627,58 @@ def add_pred_args(parser): "--pred-type", default="rnn", choices=["rnn", "conv"], - help= - """type of predictor between RNN and Convolutional [rnn, conv]""") - pred_parser.add_argument("--embed-dim", - default=1024, - type=int, - help=("token embedding dimension")) + help="""type of predictor between RNN and Convolutional [rnn, conv]""", + ) + pred_parser.add_argument( + "--embed-dim", default=1024, type=int, help=("token embedding dimension") + ) pred_parser.add_argument( "--embed-dropout-rate", default=0.0, type=float, - help=("dropout prob for predictor input embeddings")) - pred_parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help="""dropout prob for decoder RNN """) + help=("dropout prob for predictor input embeddings"), + ) + pred_parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """, + ) pred_parser.add_argument( "--rnn-type", default="lstm", choices=["lstm", "gru"], - help= - """type of recurrent network for thep predictor in [lstm, gru]""") - - pred_parser.add_argument("--num-layers", - default=2, - type=int, - help="""number of layers of the predictor """) - - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the predictor""") - pred_parser.add_argument("--out-feats", - default=512, - type=int, - help="""output features of the predictor""") - pred_parser.add_argument("--context-size", - default=2, - type=int, - help="""context length of the convolutional - predictor, 1->bigram, 2-> trigram,...""") - - parser.add_argument("--predictor", - action=ActionParser(parser=pred_parser)) + help="""type of recurrent network for thep predictor in [lstm, gru]""", + ) + + pred_parser.add_argument( + "--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """, + ) + + pred_parser.add_argument( + "--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""", + ) + pred_parser.add_argument( + "--out-feats", + default=512, + type=int, + help="""output features of the predictor""", + ) + pred_parser.add_argument( + "--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""", + ) + + parser.add_argument("--predictor", action=ActionParser(parser=pred_parser)) @staticmethod def add_joiner_args(parser): @@ -690,39 +688,43 @@ def add_joiner_args(parser): "--joiner-type", default="basic", choices=["basic"], - help= - """type of joiner network, there is only basic joiner for now""") - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the joiner""") - parser.add_argument("--joiner", - action=ActionParser(parser=pred_parser)) + help="""type of joiner network, there is only basic joiner for now""", + ) + pred_parser.add_argument( + "--hid-feats", 
+ default=512, + type=int, + help="""hidden features of the joiner""", + ) + parser.add_argument("--joiner", action=ActionParser(parser=pred_parser)) @staticmethod - def add_class_args(parser, - prefix=None, - skip=set(["in_feats", "blank_id", "vocab_size"])): + def add_class_args( + parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size"]) + ): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - required=True, - help=("input feature dimension")) + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) if "blank_id" not in skip: - parser.add_argument("--blank-id", - type=int, - default=0, - help=("blank id from tokenizer model")) + parser.add_argument( + "--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model"), + ) if "vocab_size" not in skip: - parser.add_argument("--vocab-size", - type=int, - required=True, - help=("output prediction dimension")) + parser.add_argument( + "--vocab-size", + type=int, + required=True, + help=("output prediction dimension"), + ) RNNTransducerDecoder.add_pred_args(parser) RNNTransducerDecoder.add_joiner_args(parser) @@ -730,56 +732,62 @@ def add_class_args(parser, "--rnnt-loss", default="k2_pruned", choices=["torchaudio", "k2", "k2_pruned"], - help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""", + ) parser.add_argument( "--rnnt-type", default="regular", choices=["regular", "modified", "constrained"], - help= - """type of rnn-t loss between regular, modified or constrained.""") + help="""type of rnn-t loss between regular, modified or constrained.""", + ) parser.add_argument( "--delay-penalty", default=0.0, type=float, - help= - """penalize symbol delay, which is used to make symbol emit earlier - for streaming models.""") + help="""penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""", + ) parser.add_argument( "--reduction", default="sum", choices=["sum", "mean"], - help="""type of reduction for rnn-t loss between sum or mean""") + help="""type of reduction for rnn-t loss between sum or mean""", + ) parser.add_argument( "--prune-range", default=5, type=int, help="""how many symbols to keep for each frame in k2 rnn-t - pruned loss.""") + pruned loss.""", + ) parser.add_argument( "--lm-scale", default=0.25, type=float, - help="""language model scale in rnn-t smoothed loss""") + help="""language model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--am-scale", default=0.0, type=float, - help="""acoustic model scale in rnn-t smoothed loss""") + help="""acoustic model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--simple-loss-scale", default=0.5, type=float, - help="""weight of rnn-t simple loss when using k2 pruned loss""") + help="""weight of rnn-t simple loss when using k2 pruned loss""", + ) parser.add_argument( "--pruned-warmup-steps", default=2000, type=int, help="""number of steps to warm up the k2 rnn-t pruned loss - from 0.1 to 1""") + from 0.1 to 1""", + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -794,16 +802,21 @@ def add_finetune_args(parser, prefix=None, skip=set()): action=ActionYesNo, help=( "whether to use the 
dropout probabilities passed in the " - "arguments instead of the defaults in the pretrained model.")) - parser.add_argument("--embed-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) + "arguments instead of the defaults in the pretrained model." + ), + ) + parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings"), + ) + parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN "), + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From ebef85146384fa08995a816e3843d33a1e2e8673 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Thu, 4 May 2023 17:55:23 -0400 Subject: [PATCH 15/89] update the np.str to np.str_ --- hyperion/utils/utt2info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. Args: From 720bd6eefd4fabda168fc1903876d615f4668be3 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Thu, 4 May 2023 17:58:48 -0400 Subject: [PATCH 16/89] update np.str to np.str_ --- hyperion/utils/utt2info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. 
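        Note: the dtype defaults use np.str_ because the np.str alias was deprecated
        in NumPy 1.20 and removed in 1.24; np.str_ is the equivalent string scalar type.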
Args: From b112ebde8adc127156c2b111c21e1afe3042754d Mon Sep 17 00:00:00 2001 From: ylu125 Date: Thu, 4 May 2023 21:42:51 -0400 Subject: [PATCH 17/89] Add empty __init__.py --- hyperion/torch/models/vae/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 hyperion/torch/models/vae/__init__.py diff --git a/hyperion/torch/models/vae/__init__.py b/hyperion/torch/models/vae/__init__.py new file mode 100644 index 00000000..f4883a15 --- /dev/null +++ b/hyperion/torch/models/vae/__init__.py @@ -0,0 +1,5 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" From cf861bc7b30f9c318ed20308588c71856a545933 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 8 May 2023 14:49:09 -0400 Subject: [PATCH 18/89] fix new vox2 dataprep durations, scp -> RecordingSet --- egs/librispeech/v1/run_011_train_asr.sh | 12 +- egs/librispeech/v1/run_011_train_asr_old.sh | 12 +- .../adv.v1.1/run_005_train_victim_xvector.sh | 4 +- .../run_007_train_transfer_xvector.sh | 4 +- .../run_008_adv_finetune_victim_xvector.sh | 4 +- .../adv.v2/run_011_train_victim_xvector.sh | 4 +- .../run_022_attack_type_classif_allknown.sh | 4 +- .../adv.v2/run_023_snr_classif_allknown.sh | 4 +- .../run_024_threat_model_classif_allknown.sh | 4 +- ...un_031_attack_type_verif_and_noveltydet.sh | 4 +- egs/voxceleb/adv.v2/run_032_snr_verif.sh | 4 +- .../adv.v2/run_033_threat_model_verif.sh | 4 +- .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 4 +- .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 4 +- .../config_fbank80_stmn_resnet34.v3.0.sh | 2 +- egs/voxceleb/v1.1/run_011_train_xvector.sh | 8 +- egs/voxceleb/v2/run_011_train_xvector.sh | 12 +- hyperion/bin/apply_mvn_select_frames.py | 36 +- hyperion/bin/compute_energy_vad.py | 21 +- hyperion/bin/compute_mfcc_feats.py | 20 +- hyperion/bin/decode_wav2transducer.py | 110 ++-- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 21 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 22 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 26 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 15 +- ...sine_scoring_from_transfer_adv_test_wav.py | 3 +- ...sine_scoring_from_transfer_art_test_wav.py | 20 +- hyperion/bin/eval_xvec_logits_from_wav.py | 19 +- hyperion/bin/extract_wav2vec2xvectors.py | 17 +- hyperion/bin/extract_xvectors_from_wav.py | 16 +- .../extract_xvectors_slidwin_from_feats.py | 15 +- .../bin/extract_xvectors_slidwin_from_wav.py | 18 +- .../generate_adv_attacks_xvector_classif.py | 15 +- .../bin/generate_adv_attacks_xvector_verif.py | 10 +- hyperion/bin/pack_wav_rirs.py | 10 +- hyperion/data_prep/data_prep.py | 8 +- hyperion/io/ark_data_reader.py | 179 ++++--- hyperion/io/ark_data_writer.py | 42 +- hyperion/io/audio_reader.py | 409 ++++++++------- hyperion/io/audio_writer.py | 84 +-- hyperion/io/bin_vad_reader.py | 3 +- hyperion/io/data_reader.py | 62 ++- hyperion/io/data_rw_factory.py | 51 +- hyperion/io/data_writer.py | 51 +- hyperion/io/h5_data_reader.py | 204 +++++--- hyperion/io/h5_data_writer.py | 32 +- hyperion/io/old_audio_reader.py | 477 ++++++++++++++++++ hyperion/io/vad_rw_factory.py | 10 +- hyperion/torch/data/audio_dataset.py | 160 +++--- hyperion/utils/feature_set.py | 16 +- hyperion/utils/info_table.py | 27 +- hyperion/utils/segment_set.py | 27 + hyperion/utils/utt2info.py | 2 +- 53 files changed, 1525 insertions(+), 827 deletions(-) create mode 100644 hyperion/io/old_audio_reader.py diff --git a/egs/librispeech/v1/run_011_train_asr.sh 
b/egs/librispeech/v1/run_011_train_asr.sh index 99b0065e..81ebbeae 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/librispeech/v1/run_011_train_asr_old.sh b/egs/librispeech/v1/run_011_train_asr_old.sh index 3d0e6eb1..3c9f4f5b 100755 --- a/egs/librispeech/v1/run_011_train_asr_old.sh +++ b/egs/librispeech/v1/run_011_train_asr_old.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file 
$train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh index 37a91211..aa779902 100755 --- a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh index 70bab280..420ac59d 100755 --- a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh @@ -54,11 +54,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh index 12f1e5fd..4f2c137b 100755 --- a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh 
+++ b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ adv_finetune_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh index 971b88a3..a1acb1f6 100755 --- a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh @@ -40,11 +40,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh index 71c0c89f..b453260f 100755 --- a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh index a928ae29..de811505 100755 --- a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - 
--data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh index bed225a3..aa17a1ae 100755 --- a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh @@ -48,11 +48,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 55cb8459..4ce703ba 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_032_snr_verif.sh b/egs/voxceleb/adv.v2/run_032_snr_verif.sh index 3886c339..12d42c99 100755 --- a/egs/voxceleb/adv.v2/run_032_snr_verif.sh +++ b/egs/voxceleb/adv.v2/run_032_snr_verif.sh @@ -52,11 +52,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + 
--data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh index 392bffb5..cbfaaa81 100755 --- a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh +++ b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh index 32c91da2..fdb3147f 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=false #true -do_qmf=false #true +do_snorm=true +do_qmf=true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh index 62b02c28..7aa61f00 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=true -do_qmf=true +do_snorm=false #true +do_qmf=false #true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh index c49936e0..b194d1bd 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -16,7 +16,7 @@ nnet_name=${feat_type}_resnet34.v3.0 nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 -nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 nnet_s1=$nnet_s1_dir/model_ep0035.pth 
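Editorial note on the recurring rename in the recipe hunks above and below: the x-vector training scripts now pass --data.<split>.dataset.recordings-file where they previously passed --data.<split>.dataset.audio-file; the files they point to are unchanged. As a rough, hypothetical illustration of the Kaldi-style lists these options reference (utterance ids and paths below are made up, not taken from any real dataset):

    import pathlib

    # Toy versions of the lists passed via --data.*.dataset.recordings-file
    # and --data.*.dataset.time-durs-file. Each line is "<utt-id> <value>".
    pathlib.Path("wav.scp").write_text(
        "utt-0001 /data/audio/utt-0001.flac\n"
        "utt-0002 /data/audio/utt-0002.flac\n"
    )
    # Durations in seconds, keyed by the same utterance ids.
    pathlib.Path("utt2dur").write_text(
        "utt-0001 3.52\n"
        "utt-0002 7.10\n"
    )

The segments and class2int lists used by the same scripts are analogous id-keyed text tables.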
nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index a051c136..c8ab552e 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir \ @@ -67,11 +67,11 @@ if [ $stage -le 2 ]; then --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index 0eddb1a6..bc3b5420 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -47,11 +47,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir $args \ @@ -71,11 +71,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ 
--data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ @@ -96,11 +96,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index a2456dc9..f5a3ce15 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -28,7 +32,6 @@ def process_feats( output_spec, vad_spec, write_num_frames_spec, - scp_sep, path_prefix, vad_path_prefix, part_idx, @@ -51,25 +54,16 @@ def process_feats( logging.info("opening output stream: %s" % (output_spec)) with DWF.create( - output_spec, - compress=compress, - compression_method=compression_method, - scp_sep=scp_sep, + output_spec, compress=compress, compression_method=compression_method, ) as writer: logging.info("opening input stream: %s" % (output_spec)) with DRF.create( - input_spec, - path_prefix=path_prefix, - scp_sep=scp_sep, - part_idx=part_idx, - num_parts=num_parts, + input_spec, path_prefix=path_prefix, part_idx=part_idx, num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = RDRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): key, data = reader.read(1) @@ -112,28 +106,20 @@ def process_feats( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument( - "--scp-sep", dest="scp_sep", default=" ", help=("scp file field separator") - ) parser.add_argument( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", - dest="part_idx", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), ) parser.add_argument( "--num-parts", - dest="num_parts", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), @@ -141,14 +127,12 @@ def process_feats( parser.add_argument( "--compress", - dest="compress", default=False, action="store_true", help="Lossy compress the features", ) parser.add_argument( "--compression-method", - dest="compression_method", default="auto", choices=compression_methods, help=( diff --git 
a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 15d74f3a..058f982a 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -26,14 +30,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) - writer = DWF.create(output_path, scp_sep=" ") + writer = DWF.create(output_path) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") for data in reader: key, x, fs = data - logging.info("Extracting VAD for %s" % (key)) + logging.info("Extracting VAD for %s", key) t1 = time.time() y = vad.compute(x) dt = (time.time() - t1) * 1000 @@ -41,8 +45,13 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 logging.info( - "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf) + "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f", + key, + num_speech_frames, + y.shape[0], + prob_speech, + dt, + rtf, ) writer.write([key], [y]) if write_num_frames is not None: diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index a83f95d1..ca6e26f7 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -35,10 +39,7 @@ def compute_mfcc_feats( reader = DRF.create(input_path, **input_args) writer = DWF.create( - output_path, - scp_sep=" ", - compress=compress, - compression_method=compression_method, + output_path, compress=compress, compression_method=compression_method, ) if write_num_frames is not None: @@ -55,8 +56,11 @@ def compute_mfcc_feats( dt = (time.time() - t1) * 1000 rtf = dt / (mfcc.frame_shift * y.shape[0]) logging.info( - "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, y.shape[0], dt, rtf) + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + y.shape[0], + dt, + rtf, ) writer.write([key], [y]) diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 81fa8803..c7de38f1 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,8 +13,12 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -23,8 +27,7 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -48,10 +51,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +81,7 @@ def decode_one_batch( the returned dict. """ device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +91,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,8 +119,9 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - use_gpu, **kwargs): +def decode_transducer( + input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -129,10 +135,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ar_args = AR.filter_args(**kwargs) logging.info("opening output: %s" % (output_spec)) - # with DWF.create(output_spec, scp_sep=scp_sep) as writer: with open(output_spec, "w") as writer: - logging.info("opening input stream: {} with args={}".format( - input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: while not reader.eof(): t1 = time.time() @@ -147,65 +153,69 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.info("processing utt %s" % (key0)) for aug_id in range(num_augs): t3 = time.time() - key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + key, x = key0, x0 # augment(key0, x0, augmenter, aug_df, aug_id) t4 = time.time() with torch.no_grad(): x = torch.tensor( - x[None, :], - dtype=torch.get_default_dtype()).to(device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) t5 = time.time() tot_frames = x.shape[1] logging.info( - "utt %s detected %d/%d (%.2f %%) speech 
frames" % ( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( key, x.shape[1], tot_frames, x.shape[1] / tot_frames * 100, - )) + ) + ) t6 = time.time() if x.shape[1] == 0: - y = np.zeros((model.embed_dim, ), - dtype=float_cpu()) + y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: y = decode_one_batch(model=model, sp=sp, x=x) t7 = time.time() - writer.write(key + ' ' + ' '.join(y) + "\n") + writer.write(key + " " + " ".join(y) + "\n") t8 = time.time() read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f") % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - )) + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) if __name__ == "__main__": parser = ArgumentParser( - description=("Extracts x-vectors from waveform computing " - "acoustic features on the fly")) + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) AR.add_class_args(parser) @@ -216,16 +226,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, parser.add_argument("--bpe-model", required=True) parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index bb01162f..10ea491c 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,8 +10,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -188,7 +192,7 @@ def eval_cosine_scoring( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -327,9 +331,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - 
parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -337,10 +341,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index c483ce39..a6f535b3 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,13 +7,18 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -243,7 +248,7 @@ def eval_cosine_scoring_wavegan( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -384,9 +389,9 @@ def eval_cosine_scoring_wavegan( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -394,10 +399,7 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index fba182c4..5ba42477 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -13,8 +13,12 @@ import pandas as pd from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import 
TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -195,7 +200,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -354,9 +359,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -364,10 +369,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 3cfde93e..c3732bd3 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -122,7 +126,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") with torch.no_grad(): @@ -217,10 +221,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index 44bdf59d..c00cf286 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -205,7 +205,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models,
key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -361,7 +361,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 676575fd..4f2b82ab 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -13,8 +13,12 @@ import pandas as pd from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -213,7 +218,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -386,10 +391,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index da6389fb..2f5cf3da 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,8 +11,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -93,7 +97,6 @@ def eval_xvec( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -125,8 +128,8 @@ def eval_xvec( num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -135,9 +138,7 @@ def eval_xvec( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + 
v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -243,7 +244,7 @@ def eval_xvec( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 37d6a2a6..c4c4676f 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import torchaudio.transforms as tat -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -44,6 +48,7 @@ def get_resampler(source_fs, target_fs): resamplers[source_fs] = resampler_f return resampler_f + resamplers = {} @@ -122,7 +127,6 @@ def extract_xvectors( output_spec, vad_spec, write_speech_dur, - scp_sep, vad_path_prefix, model_path, hf_chunk_length, @@ -157,16 +161,14 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info(f"opening input stream: {input_spec} with args={ar_args}") with AR(input_spec, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -283,7 +285,6 @@ def extract_xvectors( parser.add_argument("--input", dest="input_spec", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index addabbcf..1da1ac05 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,8 +11,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -95,7 +99,6 @@ def extract_xvectors( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -129,7 +132,7 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -138,9 +141,7 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + 
v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) while not reader.eof(): t1 = time.time() @@ -249,7 +250,6 @@ def extract_xvectors( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index e3d2fcbb..eaf0a5cc 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -11,8 +11,12 @@ import numpy as np import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -78,7 +82,7 @@ def extract_xvectors( model = load_model(model_path, device) if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=" ") + time_writer = DWF.create(write_timestamps_spec) dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) @@ -205,10 +209,7 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) MVN.add_class_args(parser, prefix="mvn") diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 2b1bba3b..a31bd614 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -83,7 +87,6 @@ def extract_xvectors( vad_spec, write_timestamps_spec, slidwin_params_path, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -109,7 +112,7 @@ def extract_xvectors( feat_snip_edges = feat_args["snip_edges"] if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) + time_writer = DWF.create(write_timestamps_spec) if aug_cfg is not None: augmenter = SpeechAugment.create(aug_cfg, rng=rng) @@ -121,7 +124,7 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -130,9 +133,7 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -275,7 +276,6 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file 
field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index a058893d..8c6f38a6 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -168,7 +172,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) keys, class_names, class_ids = read_utt_list( list_file, class2int_file, part_idx, num_parts @@ -329,10 +333,7 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 83375cb6..fbd3a5fb 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -197,7 +201,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) attack_factory = init_attack_factory(**kwargs) attacks_info = {} diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index dccf58da..4aafa075 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -20,7 +24,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): - writer = DWF.create(output_spec, scp_sep=" ", compress=False) + writer = DWF.create(output_spec, compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: for data in reader: diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index fb6fc6c5..19420761 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -50,12 +50,12 @@ def dataset_name(): raise NotImplementedError() @staticmethod - def _get_recording_duration(scp, i, n): + def _get_recording_duration(recordings, i, n): from ..io import SequentialAudioReader as AR durations = [] fss = [] - with AR(scp, part_idx=i, num_parts=n) 
as reader: + with AR(recordings, part_idx=i + 1, num_parts=n) as reader: for data in reader: key, x, fs = data duration = x.shape[0] / fs @@ -69,13 +69,13 @@ def get_recording_duration(self, recording_set): import itertools from ..utils import SCPList - scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) + # scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: for i in tqdm(range(self.num_threads)): future = pool.submit( - DataPrep._get_recording_duration, scp, i, self.num_threads + DataPrep._get_recording_duration, recording_set, i, self.num_threads ) futures.append(future) diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 3919ddfa..6cf22d5f 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -4,15 +4,15 @@ """ import multiprocessing as threading -import sys +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu -from ..utils.kaldi_io_funcs import (init_kaldi_input_stream, is_token, peek, - read_token) +from ..utils.kaldi_io_funcs import init_kaldi_input_stream, is_token, peek, read_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList + +from ..utils import FeatureSet, PathLike from .data_reader import RandomAccessDataReader, SequentialDataReader @@ -27,10 +27,9 @@ class SequentialArkDataReader(SequentialDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.lock = threading.Lock() @@ -42,7 +41,7 @@ def close(self): self.f.close() self.f = None - def _seek(self, offset): + def _seek(self, offset: int): """Moves the pointer of the input file. Args: @@ -52,7 +51,7 @@ def _seek(self, offset): delta = offset - cur_pos self.f.seek(delta, 1) - def _open_archive(self, file_path, offset=0): + def _open_archive(self, file_path: PathLike, offset: int = 0): """Opens the current file if it is not open and moves the file pointer to a given position. Closes previous open Ark files. @@ -69,7 +68,7 @@ def _open_archive(self, file_path, offset=0): if offset > 0: self._seek(offset) - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -86,7 +85,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -120,10 +119,8 @@ class SequentialArkFileDataReader(SequentialArkDataReader): split_by_key: If True, all the elements with the same key go to the same part. 
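Editorial note on the ark_data_reader.py changes that follow: the old SCPList is replaced by the newer FeatureSet abstraction, where each feature matrix is described by an id, a storage_path and a storage_byte offset (plus optional start/num_frames columns). A minimal sketch of that access pattern, using a plain pandas DataFrame as a stand-in (the real hyperion.utils.FeatureSet API is assumed; only the column names and the .iloc/.loc/.index usage come from the hunks below, and all values are made up):

    import pandas as pd

    # Stand-in table with the columns used by the readers below.
    feature_set = pd.DataFrame(
        {
            "id": ["utt-0001", "utt-0002"],
            "storage_path": ["feats/feats.1.ark", "feats/feats.1.ark"],
            "storage_byte": [17, 40213],
        }
    ).set_index("id", drop=False)

    # Sequential access by position, as in SequentialArkScriptDataReader.read():
    feature_spec = feature_set.iloc[0]
    print(feature_spec["storage_path"], feature_spec["storage_byte"])

    # Random access by key, as in RandomAccessArkDataReader.read():
    if "utt-0002" in feature_set.index:
        print(feature_set.loc["utt-0002", "storage_byte"])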
""" - def __init__(self, file_path, **kwargs): - super(SequentialArkFileDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) + def __init__(self, file_path: PathLike, **kwargs): + super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._eof = False self._keys = None @@ -151,7 +148,7 @@ def keys(self): return self._keys - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -188,7 +185,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -206,12 +209,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): key: List of recording names. data: List of feature matrices/vectors or 3D/2D numpy array. """ - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] count = 0 @@ -264,28 +263,25 @@ class SequentialArkScriptDataReader(SequentialArkDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): - super(SequentialArkScriptDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): + super().__init__(file_path, permissive=False, **kwargs) + self.feature_set = FeatureSet.load(self.file_path, sep=scp_sep) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) self.cur_item = 0 @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open Ark files and puts the read pointer pointing @@ -295,9 +291,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -318,15 +314,18 @@ def read_shapes(self, num_records=0, assert_same_dim=True): for i in range(num_records): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) - - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -338,7 +337,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -359,12 +364,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self.scp) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -373,7 +374,14 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -417,21 +425,24 @@ class RandomAccessArkDataReader(RandomAccessDataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). 
""" def __init__( - self, file_path, path_prefix=None, transform=None, permissive=False, scp_sep=" " + self, + file_path: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, ): - super(RandomAccessArkDataReader, self).__init__( - file_path, transform, permissive - ) + super().__init__(file_path, transform, permissive) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -448,7 +459,7 @@ def close(self): f.close() self.f = [None] * len(self.f) - def _open_archive(self, key_idx, offset=0): + def _open_archive(self, key_idx: int, offset: int = 0): """Opens the Ark file correspoding to a given feature/matrix if it is not already open and moves the file pointer to the point where we can read that feature matrix. @@ -473,7 +484,9 @@ def _open_archive(self, key_idx, offset=0): return f, self.locks[archive_idx] - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -489,7 +502,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=np.int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -507,7 +522,9 @@ def read_dims(self, keys, assert_same_dim=True): assert np.all(dims == dims[0]) return dims - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -525,25 +542,26 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) shape_i = KaldiMatrix.read_shape(f, binary, sequential_mode=False) - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) shapes.append(shape_i) @@ -553,7 +571,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -574,12 +598,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -588,15 +608,20 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 58f5c0a1..6adf78b2 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,15 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List import numpy as np from ..hyp_defs import float_save -from ..utils.kaldi_io_funcs import (init_kaldi_output_stream, is_token, - write_token) +from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -28,11 +27,17 @@ class ArkDataWriter(DataWriter): {auto 
(default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). + """ - def __init__(self, archive_path, script_path=None, binary=True, **kwargs): - super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs) + def __init__( + self, + archive_path: PathLike, + script_path: Optional[PathLike] = None, + binary: bool = True, + **kwargs, + ): + super().__init__(archive_path, script_path, **kwargs) self.binary = binary if binary: @@ -40,10 +45,9 @@ def __init__(self, archive_path, script_path=None, binary=True, **kwargs): else: self.f = open(archive_path, "w") - if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + if script_path is not None and not self.script_is_scp: + row = self.script_sep.join(["id", "storage_path", "storage_byte"]) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -67,7 +71,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts the feature matrix from numpy array to KaldiMatrix or KaldiCompressedMatrix. """ @@ -89,7 +93,11 @@ def _convert_data(self, data): raise ValueError("Data is not ndarray or KaldiMatrix") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: @@ -114,9 +122,11 @@ def write(self, keys, data): data_i.write(self.f, self.binary) if self.f_script is not None: - self.f_script.write( - "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") + else: + row = self.script_sep.join([key_i, self.archive_path, str(pos)]) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 69cfa65b..1052ce8c 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -10,11 +10,13 @@ import subprocess import numpy as np +import pandas as pd import soundfile as sf from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List from ..hyp_defs import float_cpu -from ..utils import SCPList, SegmentList +from ..utils import RecordingSet, SegmentSet, PathLike valid_ext = [ ".wav", @@ -34,7 +36,7 @@ ".sds", ".sf", ".voc", - "w64", + ".w64", ".wve", ".xi", ] @@ -44,38 +46,36 @@ class AudioReader(object): """Class to read audio files from wav, flac or pipe Attributes: - file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
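For context, a minimal sketch of the updated writer API, assuming the module path hyperion.io.ark_data_writer from this diff; with a .csv script path the writer emits the "id,storage_path,storage_byte" header shown above instead of Kaldi scp lines:

import numpy as np
from hyperion.io.ark_data_writer import ArkDataWriter

feats = [np.random.randn(100, 40), np.random.randn(80, 40)]
with ArkDataWriter("exp/feats.ark", script_path="exp/feats.csv") as writer:
    # matrices go to the ark archive; the csv records where each one was stored
    writer.write(["utt1", "utt2"], feats)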
- segments_path: segments file with format: segment_id file_id tbeg tend + recordings: RecordingSet or file path to RecordingSet + segments: SegmentSet or file path to SegmentSet wav_scale: multiplies signal by scale factor """ - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - self.file_path = file_path - if isinstance(file_path, SCPList): - self.scp = file_path - else: - self.scp = SCPList.load(file_path, sep=" ", is_wav=True) - - self.segments_path = segments_path - if segments_path is None: - self.segments = None - self.with_segments = False - else: + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + ): + if not isinstance(recordings, RecordingSet): + recordings = RecordingSet.load(recordings) + + self.recordings = recordings + + self.with_segments = False + if segments is not None: self.with_segments = True - if isinstance(file_path, SegmentList): - self.segments = segments_path - else: - self.segments = SegmentList.load(segments_path, - sep=" ", - index_by_file=False) + if not isinstance(segments, SegmentSet): + segments = SegmentSet.load(segments) + self.segments = segments self.wav_scale = wav_scale @property def keys(self): if self.with_segments: - return np.asarray(self.segments["segment_id"]) - return self.scp.key + return self.segments["id"].values + return self.recordings["id"].values def __enter__(self): """Function required when entering contructions of type @@ -94,10 +94,12 @@ def __exit__(self, exc_type, exc_value, traceback): pass @staticmethod - def read_wavspecifier(wavspecifier, - scale=2**15, - time_offset=0, - time_dur=0): + def read_wavspecifier( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0.0, + time_dur: float = 0.0, + ): """Reads an audiospecifier (audio_file/pipe) It reads from pipe or from all the files that can be read by `libsndfile ` @@ -113,59 +115,123 @@ def read_wavspecifier(wavspecifier, wavspecifier = wavspecifier.strip() if wavspecifier[-1] == "|": wavspecifier = wavspecifier[:-1] - x, fs = AudioReader.read_pipe(wavspecifier, scale) - if time_offset == 0 and time_dur == 0: - return x, fs - - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - if num_samples == 0: - return x[start_sample:], fs - - end_sample = start_sample + num_samples - assert end_sample <= len(x) - return x[start_sample:end_sample], fs + return AudioReader.read_pipe(wavspecifier, scale, time_offset, time_dur) ext = os.path.splitext(wavspecifier)[1] if ext in valid_ext: - if time_offset == 0 and time_dur == 0: - x, fs = sf.read(wavspecifier, dtype=float_cpu()) - x *= scale - return x, fs - - with sf.SoundFile(wavspecifier, "r") as f: - fs = f.samplerate - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - f.seek(start_sample) - if num_samples > 0: - x = scale * f.read(num_samples, dtype=float_cpu()) - else: - x = scale * f.read(dtype=float_cpu()) - return x, fs + return AudioReader.read_file(wavspecifier, scale, time_offset, time_dur) raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2**15): + def read_pipe( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output scale: Multiplies signal by scale factor """ - # proc = 
subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - proc = subprocess.Popen(wavspecifier, - shell=True, - stdout=subprocess.PIPE) + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) pipe = proc.communicate()[0] if proc.returncode != 0: - raise Exception("Wave read pipe command %s returned code %d" % - (wavspecifier, proc.returncode)) + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale - return x, fs + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + @staticmethod + def read_file_sf( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + + return x, fs + + @staticmethod + def read_file( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + try: + return AudioReader.read_file_sf(wavspecifier, scale, time_offset, time_dur) + except: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading keys=%s offset=%f duration=%f" + "retrying reading until end-of-file ..." + ), + wavspecifier, + time_offset, + time_dur, + ) + try: + x, fs = AudioReader.read_file_sf(wavspecifier, scale, time_offset) + num_samples = int(math.floor(time_dur * fs)) + x = x[:num_samples] + return x, fs + except: + logging.info( + ( + "error-2 reading keys=%s offset=%f duration=%f" + "retrying reading full file ..." 
+ ), + wavspecifier, + time_offset, + time_dur, + ) + + x, fs = AudioReader.read_file_sf(wavspecifier, scale) + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + x = x[start_sample : start_sample + num_samples] + return x, fs - def _read_segment(self, segment, time_offset=0, time_dur=0): + def _read_segment( + self, segment: pd.Series, time_offset: float = 0, time_dur: float = 0 + ): """Reads a wave segment Args: @@ -173,28 +239,11 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): Returns: Wave, sampling frequency """ - file_id = segment["file_id"] - t_beg = segment["tbeg"] + time_offset - t_end = segment["tend"] - if time_dur > 0: - t_end_new = t_beg + time_dur - assert t_end_new <= t_end - t_end = t_end_new - - file_path, _, _ = self.scp[file_id] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) - num_samples_i = len(x_i) - s_beg = int(t_beg * fs_i) - if s_beg >= num_samples_i: - raise Exception( - "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (file_id, t_beg, s_beg, file_id, num_samples_i)) - - s_end = int(t_end * fs_i) - if s_end > num_samples_i or t_end < 0: - s_end = num_samples_i - - x_i = x_i[s_beg:s_end] + recording_id = segment["recording_id"] + t_start = segment["start"] + time_offset + t_dur = segment["duration"] + storage_path = self.recordings.loc[recording_id, "storage_path"] + x_i, fs_i = self.read_wavspecifier(storage_path, self.wav_scale, t_start, t_dur) return x_i, fs_i def read(self): @@ -202,27 +251,23 @@ def read(self): class SequentialAudioReader(AudioReader): - def __init__( self, - file_path, - segments_path=None, - wav_scale=2**15 - 1, - part_idx=1, - num_parts=1, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + part_idx: int = 1, + num_parts: int = 1, ): - super().__init__(file_path, segments_path, wav_scale=wav_scale) + super().__init__(recordings, segments, wav_scale=wav_scale) self.cur_item = 0 self.part_idx = part_idx self.num_parts = num_parts if self.num_parts > 1: if self.with_segments: - self.segments = self.segments.split(self.part_idx, - self.num_parts) + self.segments = self.segments.split(self.part_idx, self.num_parts) else: - self.scp = self.scp.split(self.part_idx, - self.num_parts, - group_by_key=False) + self.recordings = self.recordings.split(self.part_idx, self.num_parts) def __iter__(self): """Needed to build an iterator, e.g.: @@ -262,9 +307,9 @@ def eof(self): """ if self.with_segments: return self.cur_item == len(self.segments) - return self.cur_item == len(self.scp) + return self.cur_item == len(self.recordings) - def read(self, num_records=0, time_offset=0, time_durs=0): + def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = 0): """Reads next num_records audio files Args: @@ -281,7 +326,7 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: num_records = len(self.segments) - self.cur_item else: - num_records = len(self.scp) - self.cur_item + num_records = len(self.recordings) - self.cur_item offset_is_list = isinstance(time_offset, (list, np.ndarray)) dur_is_list = isinstance(time_durs, (list, np.ndarray)) @@ -297,13 +342,14 @@ def read(self, num_records=0, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - segment = self.segments[self.cur_item] - key = segment["segment_id"] + segment = self.segments.iloc[self.cur_item] + key = 
segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path, _, _ = self.scp[self.cur_item] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + key, file_path = self.recordings.iloc[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) keys.append(key) data.append(x_i) @@ -318,14 +364,14 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -334,38 +380,50 @@ def add_class_args(parser, prefix=None): "--part-idx", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) parser.add_argument( "--num-parts", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) except: pass if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args class RandomAccessAudioReader(AudioReader): + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + ): + super().__init__(recordings, segments, wav_scale) - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - super().__init__(file_path, segments_path, wav_scale) - - def _read(self, keys, time_offset=0, time_durs=0): + def read( + self, + keys: Union[str, List, np.array], + time_offset: float = 0, + time_durs: float = 0, + ): """Reads the waveforms for the recordings in keys. Args: keys: List of recording/segment_ids names. + time_offset: float or float list with time-offsets + time_durs: float or float list with durations Returns: data: List of waveforms @@ -384,93 +442,92 @@ def _read(self, keys, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - if not (key in self.segments): + if not (key in self.segments.index): raise Exception("Key %s not found" % key) - segment = self.segments[key] + segment = self.segments.loc[key] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - if not (key in self.scp): + if not (key in self.recordings.index): raise Exception("Key %s not found" % key) - file_path, _, _ = self.scp[key] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + file_path = self.recordings.loc[key, "storage_path"] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) data.append(x_i) fs.append(fs_i) return data, fs - def read(self, keys, time_offset=0, time_durs=0): - """Reads the waveforms for the recordings in keys. - - Args: - keys: List of recording/segment_ids names. - - Returns: - data: List of waveforms - fs: List of sampling freq. 
- """ - try: - x, fs = self._read(keys, - time_offset=time_offset, - time_durs=time_durs) - except: - if isinstance(keys, str): - keys = [keys] - - if not isinstance(time_offset, (list, np.ndarray)): - time_offset = [time_offset] * len(keys) - if not isinstance(time_durs, (list, np.ndarray)): - time_durs = [time_durs] * len(keys) - - try: - # some files produce error in the fseek after reading the data, - # this seems an issue from pysoundfile or soundfile lib itself - # we try to read from - # time-offset to the end of the file, and remove the extra frames later, - # this solves the problem in most cases - logging.info(("error-1 reading at keys={} offset={} " - "retrying reading until end-of-file ...").format( - keys, time_offset)) - x, fs = self._read(keys, time_offset=time_offset) - for i in range(len(x)): - end_sample = int(time_durs[i] * fs[i]) - x[i] = x[i][:end_sample] - except: - # try to read the full file - logging.info(("error-2 reading at key={}, " - "retrying reading full file ...").format(keys)) - x, fs = self._read(keys) - for i in range(len(x)): - start_sample = int(time_offset[i] * fs[i]) - end_sample = start_sample + int(time_durs[i] * fs[i]) - x[i] = x[i][start_sample:end_sample] - - return x, fs + # def read(self, keys, time_offset=0, time_durs=0): + # """Reads the waveforms for the recordings in keys. + + # Args: + # keys: List of recording/segment_ids names. + + # Returns: + # data: List of waveforms + # fs: List of sampling freq. + # """ + # try: + # x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + # except: + # if isinstance(keys, str): + # keys = [keys] + + # if not isinstance(time_offset, (list, np.ndarray)): + # time_offset = [time_offset] * len(keys) + # if not isinstance(time_durs, (list, np.ndarray)): + # time_durs = [time_durs] * len(keys) + + # try: + # logging.info( + # ( + # "error-1 reading at keys={} offset={} " + # "retrying reading until end-of-file ..." + # ).format(keys, time_offset) + # ) + # x, fs = self._read(keys, time_offset=time_offset) + # for i in range(len(x)): + # end_sample = int(time_durs[i] * fs[i]) + # x[i] = x[i][:end_sample] + # except: + # # try to read the full file + # logging.info( + # ( + # "error-2 reading at key={}, " "retrying reading full file ..." 
+ # ).format(keys) + # ) + # x, fs = self._read(keys) + # for i in range(len(x)): + # start_sample = int(time_offset[i] * fs[i]) + # end_sample = start_sample + int(time_durs[i] * fs[i]) + # x[i] = x[i][start_sample:end_sample] + + # return x, fs @staticmethod def filter_args(**kwargs): - valid_args = ("wav_scale", ) + valid_args = ("wav_scale",) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index f98a3251..e416c209 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -8,12 +8,16 @@ import numpy as np import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List +from pathlib import Path from ..hyp_defs import float_cpu from ..utils.kaldi_io_funcs import is_token -from ..utils.scp_list import SCPList +from ..utils import PathLike from .audio_reader import valid_ext + subtype_to_npdtype = { "PCM_32": "int32", "ALAW": "int16", @@ -37,25 +41,23 @@ class AudioWriter(object): Attributes: output_path: output data file path. - script_path: optional output scp file. + script_path: optional output kaldi .scp or pandas .csv file. audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) - scp_sep: Separator for scp files (default ' '). """ def __init__( self, - output_path, - script_path=None, - audio_format="wav", - audio_subtype=None, - scp_sep=" ", + output_path: PathLike, + script_path: Optional[PathLike] = None, + audio_format: str = "wav", + audio_subtype: Optional[str] = None, ): - self.output_path = output_path - self.script_path = script_path + self.output_path = Path(output_path) + self.script_path = Path(script_path) if script_path is not None else None self.audio_format = audio_format - self.scp_sep = scp_sep + self.output_path.mkdir(exist_ok=True, parents=True) assert "." 
+ self.audio_format in valid_ext if audio_subtype is None: @@ -64,16 +66,23 @@ def __init__( self.subtype = audio_subtype assert sf.check_format(self.audio_format, self.subtype) - if not os.path.exists(output_path): - try: - os.makedirs(output_path) - except FileExistsError: - pass - + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", "utf-8") + row = self.script_sep.join( + ["id", "storage_path", "duration", "sample_freq"] + ) + self.f_script.write(f"{row}\n") def __enter__(self): """Function required when entering contructions of type @@ -96,7 +105,12 @@ def close(self): if self.f_script is not None: self.f_script.close() - def write(self, keys, data, fs): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + fs: Union[int, float, List[int], List[float], np.array], + ): """Writes waveform to audio file. Args: @@ -120,14 +134,21 @@ def write(self, keys, data, fs): file_basename, self.audio_format, ) - fs_i = fs[i] if fs_is_list else fs + fs_i = int(fs[i]) if fs_is_list else fs data_i = data[i].astype(dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) if self.f_script is not None: - self.f_script.write("%s%s%s\n" % (key_i, self.scp_sep, output_file)) + if self.script_is_scp: + self.f_script.write(f"{key_i} {output_file}\n") + else: + duration_i = data_i.shape[-1] / fs_i + row = self.script_sep.join( + [key_i, output_file, str(duration_i), str(fs_i)] + ) + self.f_script.write(f"{row}\n") self.f_script.flush() return output_files @@ -146,29 +167,30 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
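A sketch of the intended AudioWriter usage, assuming the module path from this diff; with a .csv script it writes the "id,storage_path,duration,sample_freq" header shown above:

import numpy as np
from hyperion.io.audio_writer import AudioWriter

fs = 16000
# 1 s tone scaled to the library's 2**15 - 1 waveform convention
tone = 0.5 * (2 ** 15 - 1) * np.sin(2 * np.pi * 440 * np.arange(fs) / fs)
with AudioWriter("exp/audio", script_path="exp/audio.csv",
                 audio_format="flac") as writer:
    writer.write(["utt1"], [tone], fs)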
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") # parser.add_argument(p1+'output-wav-scale', default=1, type=float, # help=('scale to divide the waveform before writing')) parser.add_argument( - p1 + "output-audio-format", + "--output-audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - p1 + "output-audio-subtype", + "--output-audio-subtype", default=None, choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], help=("coding format for audio file"), ) - # parser.add_argument(p1+'output-fs', default=16000, type=int, - # help=('output sample frequency')) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index e4e64777..82e2a0c5 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -18,13 +18,12 @@ def __init__( self, rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, ): - r = DRF.create(rspecifier, path_prefix, scp_sep=scp_sep) + r = DRF.create(rspecifier, path_prefix) super().__init__(r.file_path, r.permissive) self.r = r self.frame_shift = frame_shift diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index bbefa62d..73c120b5 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -6,18 +6,24 @@ import logging import multiprocessing from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu from ..np.transforms import TransformList -from ..utils.scp_list import SCPList +from ..utils import PathLike class DataReader(object): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files. Attributes: @@ -57,7 +63,7 @@ def close(self): pass @staticmethod - def _squeeze(data, permissive=False): + def _squeeze(data: np.array, permissive: bool = False): """Converts list of matrices to 3D numpy array or list of vectors to 2D numpy array. @@ -121,7 +127,7 @@ def _combine_ranges(read_range, row_offset, num_rows): return row_offset, num_rows @staticmethod - def _apply_range_to_shape(shape, row_offset, num_rows): + def _apply_range_to_shape(shape: Tuple[int, int], row_offset: int, num_rows: int): """Modifies shape given the user defined row_offset and num_rows to read. If we are reading a matrix of shape (100,4) and row_offset=10, num_rows=20, it returns (20,4). @@ -158,25 +164,22 @@ class SequentialDataReader(DataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. 
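The row-range helpers referenced above reduce to simple arithmetic; a standalone sketch (not the library implementation) that reproduces the documented (100, 4) example:

def apply_range_to_shape(shape, row_offset, num_rows):
    # vectors are returned as stored; matrices get their row count trimmed
    if len(shape) != 2:
        return shape
    total_rows = shape[0]
    if num_rows == 0:  # 0 means "read until the end"
        num_rows = total_rows - row_offset
    assert row_offset + num_rows <= total_rows
    return (num_rows, shape[1])

print(apply_range_to_shape((100, 4), 10, 20))  # (20, 4)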
""" __metaclass__ = ABCMeta def __init__( self, - file_path, - transform=None, - permissive=False, - part_idx=1, - num_parts=1, - split_by_key=False, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + part_idx: int = 1, + num_parts: int = 1, ): super().__init__(file_path, transform, permissive) self.lock = multiprocessing.Lock() self.part_idx = part_idx self.num_parts = num_parts - self.split_by_key = split_by_key def __iter__(self): """Needed to build an iterator, e.g.: @@ -218,7 +221,7 @@ def eof(self): return False @abstractmethod - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -234,7 +237,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -250,7 +253,7 @@ def read_dims(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -266,7 +269,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -290,7 +299,12 @@ def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): class RandomAccessDataReader(DataReader): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files in random order. @@ -305,7 +319,7 @@ def __init__(self, file_path, transform=None, permissive=False): super().__init__(file_path, transform, permissive) @abstractmethod - def read_num_rows(self, keys=None, assert_same_dim=True): + def read_num_rows(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -320,7 +334,7 @@ def read_num_rows(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_dims(self, keys=None, assert_same_dim=True): + def read_dims(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -335,7 +349,7 @@ def read_dims(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, keys=None, assert_same_dim=True): + def read_shapes(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -350,7 +364,13 @@ def read_shapes(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read(self, keys, squeeze=False, offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str]], + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index 7868baae..b56e8c27 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -4,10 +4,13 @@ """ import logging +from typing import Union, Optional, List, Callable, Tuple from jsonargparse import ActionParser, ArgumentParser +import numpy as np from ..utils.kaldi_matrix import compression_methods +from ..utils import PathLike from .ark_data_reader import RandomAccessArkDataReader as RADR from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR @@ -17,8 +20,7 @@ from .h5_data_reader import SequentialH5FileDataReader as SH5FDR from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR from .h5_data_writer import H5DataWriter as H5DW -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType class DataWriterFactory(object): @@ -27,7 +29,9 @@ class DataWriterFactory(object): """ @staticmethod - def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): + def create( + wspecifier: PathLike, compress: bool = False, compression_method: str = "auto" + ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -43,7 +47,6 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) else: return ADW( @@ -53,21 +56,19 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "compress", "compression_method") + valid_args = ("compress", "compression_method") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument("--compress", default=False, action="store_true") parser.add_argument( "--compression-method", default="auto", choices=compression_methods @@ -80,7 +81,7 @@ def add_class_args(parser, prefix=None): class SequentialDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): + def create(rspecifier: PathLike, path_prefix: Optional[PathLike] = None, **kwargs): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) @@ -92,27 +93,21 @@ def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): return SAFDR(rspecifier.archive, **kwargs) else: if rspecifier.archive_type == ArchiveType.H5: - return SH5SDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SH5SDR(rspecifier.script, path_prefix, **kwargs) else: - return SASDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return 
SASDR(rspecifier.script, path_prefix, **kwargs) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix", "part_idx", "num_parts") + valid_args = ("path_prefix", "part_idx", "num_parts") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) @@ -139,7 +134,11 @@ def add_class_args(parser, prefix=None): class RandomAccessDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): + def create( + rspecifier: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + ): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) logging.debug(rspecifier.__dict__) @@ -162,7 +161,6 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) else: return RADR( @@ -170,26 +168,19 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix") + valid_args = "path_prefix" return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index cf2bb4f9..8adbf87a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,13 @@ import os from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List +from pathlib import Path +import numpy as np +from ..utils import PathLike -class DataWriter(object): +class DataWriter: """Abstract base class to write Ark or hdf5 feature files. Attributes: @@ -19,35 +23,42 @@ class DataWriter(object): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). 
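A sketch of creating readers through the factories with Kaldi-style read specifiers; whether the "scp:" specifier accepts the .csv FeatureSet script written earlier is an assumption here:

from hyperion.io.data_rw_factory import (
    RandomAccessDataReaderFactory,
    SequentialDataReaderFactory,
)

seq_reader = SequentialDataReaderFactory.create("ark:exp/feats.ark")
rnd_reader = RandomAccessDataReaderFactory.create("scp:exp/feats.csv")
x = rnd_reader.read(["utt1"])[0]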
""" __metaclass__ = ABCMeta def __init__( self, - archive_path, - script_path=None, - flush=False, - compress=False, - compression_method="auto", - scp_sep=" ", + archive_path: PathLike, + script_path: Optional[PathLike] = None, + flush: bool = False, + compress: bool = False, + compression_method: str = "auto", ): - self.archive_path = archive_path - self.script_path = script_path + self.archive_path = Path(archive_path) + self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method - self.scp_sep = scp_sep - archive_dir = os.path.dirname(archive_path) - if not os.path.exists(archive_dir): - os.makedirs(archive_dir) + archive_dir = self.archive_path.parent + archive_dir.mkdir(exist_ok=True, parents=True) + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - script_dir = os.path.dirname(script_path) - if not os.path.exists(script_dir): - os.makedirs(script_dir) + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", "utf-8") + row = self.script_sep.join(["id", "storage_path"]) + self.f_script.write(f"{row}\n") def __enter__(self): """Function required when entering contructions of type @@ -77,7 +88,11 @@ def flush(self): pass @abstractmethod - def write(self, key, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py index dfefbec3..d509504d 100644 --- a/hyperion/io/h5_data_reader.py +++ b/hyperion/io/h5_data_reader.py @@ -6,8 +6,8 @@ """ import multiprocessing -import sys import time +from typing import Union, Optional, List, Callable, Tuple import h5py import numpy as np @@ -16,11 +16,18 @@ from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from ..utils.list_utils import split_list, split_list_group_by_key -from ..utils.scp_list import SCPList + +# from ..utils.scp_list import SCPList +from ..utils import FeatureSet, PathLike from .data_reader import RandomAccessDataReader, SequentialDataReader -def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): +def _read_h5_data( + dset, + row_offset: int = 0, + num_rows: int = 0, + transform: Optional[Callable[[np.array], np.array]] = None, +): """Auxiliary function to read the feature matrix from hdf5 dataset. It decompresses the data if it was compressed. @@ -74,7 +81,7 @@ class SequentialH5DataReader(SequentialDataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.cur_file = None @@ -86,7 +93,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Opens the hdf5 file where the next matrix/vector is if it is not open. If there was another hdf5 file open, it closes it. 
@@ -96,7 +103,7 @@ def _open_archive(self, file_path): self.cur_file = file_path self.f = h5py.File(file_path, "r") - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -113,7 +120,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -147,7 +154,7 @@ class SequentialH5FileDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._keys = list(self.f.keys()) @@ -172,7 +179,7 @@ def eof(self): """Returns True when it reaches the end of the ark file.""" return self.cur_item == len(self._keys) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -204,7 +211,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -225,12 +238,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self._keys) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] with self.lock: @@ -268,7 +277,6 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): the scp file. This is useful when data is read from a different directory of that it was created. - scp_sep: Separator for scp files (default ' '). transform: TransformList object, applies a transformation to the features after reading them from disk. part_idx: It splits the input into num_parts and writes only @@ -277,20 +285,20 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. 
""" - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, permissive=False, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open hdf5 files and puts the read pointer pointing @@ -300,9 +308,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -316,7 +324,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): List of tuples with num_records shapes. """ if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item keys = [] shapes = [] @@ -324,14 +332,15 @@ def read_shapes(self, num_records=0, assert_same_dim=True): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - - self._open_archive(file_path) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + self._open_archive(feature_spec["storage_path"]) shape_i = self.f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -343,7 +352,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -362,14 +377,10 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): data: List of feature matrices/vectors or 3D/2D numpy array. 
""" if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -378,7 +389,13 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -413,11 +430,18 @@ class RandomAccessH5DataReader(RandomAccessDataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): super().__init__(file_path, transform, permissive) self.f = None - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -433,7 +457,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -463,7 +489,7 @@ class RandomAccessH5FileDataReader(RandomAccessH5DataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.lock = multiprocessing.Lock() self._open_archive(file_path) @@ -474,7 +500,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Open the hdf5 file it it is not open.""" if self.f is None: self.close() @@ -484,7 +510,9 @@ def _open_archive(self, file_path): def keys(self): return list(self.f.keys()) - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -518,7 +546,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. 
Args: @@ -539,12 +573,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -589,17 +619,20 @@ class RandomAccessH5ScriptDataReader(RandomAccessH5DataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -614,9 +647,9 @@ def close(self): @property def keys(self): - return self.scp.key + return self.feature_set["id"] - def _open_archive(self, key_idx): + def _open_archive(self, key_idx: int): """Opens the hdf5 file correspoding to a given feature/matrix if it is not already open. @@ -633,7 +666,9 @@ def _open_archive(self, key_idx): return self.f[archive_idx], self.locks[archive_idx] - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
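Both H5 readers ultimately reduce to plain h5py access with one dataset per utterance id; a standalone sketch of that pattern:

import os
import h5py
import numpy as np

os.makedirs("exp", exist_ok=True)
with h5py.File("exp/feats.h5", "w") as f:
    f.create_dataset("utt1", data=np.random.randn(100, 40).astype("float32"))

with h5py.File("exp/feats.h5", "r") as f:
    print(f["utt1"].shape)    # (100, 40), read without loading the data
    chunk = f["utt1"][10:30]  # rows 10..29, as done for row_offset/num_rows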
Args: @@ -651,18 +686,15 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] f, lock = self._open_archive(index) if not (key in f): if self.permissive: @@ -673,8 +705,12 @@ def read_shapes(self, keys, assert_same_dim=True): with lock: shape_i = f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - # print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) + + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + shapes.append(shape_i) if assert_same_dim: @@ -683,7 +719,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -704,12 +746,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -718,15 +756,19 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index fed91d1e..c34aa0ca 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List import h5py import numpy as np @@ -11,7 +11,7 @@ from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -27,18 +27,18 @@ class H5DataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 
1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, archive_path, script_path=None, **kwargs): + def __init__( + self, archive_path: PathLike, script_path: Optional[PathLike] = None, **kwargs + ): super().__init__(archive_path, script_path, **kwargs) self.f = h5py.File(archive_path, "w") - if script_path is None: - self.f_script = None - else: - self.f_script = open(script_path, "w") + if script_path is not None and not self.script_is_scp: + row = self.script_sep.join(["id", "storage_path"]) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -64,7 +64,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts data to the format for saving. Compresses the data it needed. Args: @@ -85,7 +85,11 @@ def _convert_data(self, data): else: raise ValueError("Data is not ndarray") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: @@ -108,9 +112,11 @@ def write(self, keys, data): dset.attrs[k] = v if self.f_script is not None: - self.f_script.write( - "%s%s%s\n" % (key_i, self.scp_sep, self.archive_path) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}\n") + else: + row = self.script_sep.join([key_i, self.archive_path]) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/old_audio_reader.py b/hyperion/io/old_audio_reader.py new file mode 100644 index 00000000..341f04a4 --- /dev/null +++ b/hyperion/io/old_audio_reader.py @@ -0,0 +1,477 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import io +import logging +import math +import os +import subprocess + +import numpy as np +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..hyp_defs import float_cpu +from ..utils import SCPList, SegmentList + +valid_ext = [ + ".wav", + ".flac", + ".ogg", + ".au", + ".avr", + ".caf", + ".htk", + ".iff", + ".mat", + ".mpc", + ".oga", + ".pvf", + ".rf64", + ".sd2", + ".sds", + ".sf", + ".voc", + "w64", + ".wve", + ".xi", +] + + +class AudioReader(object): + """Class to read audio files from wav, flac or pipe + + Attributes: + file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
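A sketch of the intended H5DataWriter usage, assuming the module path from this diff; with a .csv script it writes the "id,storage_path" header shown above:

import numpy as np
from hyperion.io.h5_data_writer import H5DataWriter

xvectors = [np.random.randn(256).astype("float32") for _ in range(3)]
with H5DataWriter("exp/xvectors.h5", script_path="exp/xvectors.csv") as writer:
    writer.write(["spk1", "spk2", "spk3"], xvectors)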
+ segments_path: segments file with format: segment_id file_id tbeg tend + wav_scale: multiplies signal by scale factor + """ + + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + self.file_path = file_path + if isinstance(file_path, SCPList): + self.scp = file_path + else: + self.scp = SCPList.load(file_path, sep=" ", is_wav=True) + + self.segments_path = segments_path + if segments_path is None: + self.segments = None + self.with_segments = False + else: + self.with_segments = True + if isinstance(file_path, SegmentList): + self.segments = segments_path + else: + self.segments = SegmentList.load( + segments_path, sep=" ", index_by_file=False + ) + + self.wav_scale = wav_scale + + @property + def keys(self): + if self.with_segments: + return np.asarray(self.segments["segment_id"]) + return self.scp.key + + def __enter__(self): + """Function required when entering contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Function required when exiting from contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + pass + + @staticmethod + def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): + """Reads an audiospecifier (audio_file/pipe) + It reads from pipe or from all the files that can be read + by `libsndfile ` + + Args: + wavspecifier: A pipe, wav, flac, ogg file etc. + scale: Multiplies signal by scale factor + time_offset: float indicating the start time to read in the utterance. + time_durs: floats indicating the number of seconds to read from the utterance, + if 0 it reads untils the end + + """ + wavspecifier = wavspecifier.strip() + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + x, fs = AudioReader.read_pipe(wavspecifier, scale) + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + ext = os.path.splitext(wavspecifier)[1] + if ext in valid_ext: + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + return x, fs + + raise Exception("Unknown format for %s" % (wavspecifier)) + + @staticmethod + def read_pipe(wavspecifier, scale=2 ** 15): + """Reads wave file from a pipe + Args: + wavspecifier: Shell command with pipe output + scale: Multiplies signal by scale factor + """ + # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) + pipe = proc.communicate()[0] + if proc.returncode != 0: + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) + x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) + x *= scale + return x, fs + + def _read_segment(self, segment, time_offset=0, time_dur=0): + """Reads a wave segment + + Args: + segment: pandas DataFrame (segment_id , file_id, 
tbeg, tend) + Returns: + Wave, sampling frequency + """ + file_id = segment["file_id"] + t_beg = segment["tbeg"] + time_offset + t_end = segment["tend"] + if time_dur > 0: + t_end_new = t_beg + time_dur + assert t_end_new <= t_end + t_end = t_end_new + + file_path, _, _ = self.scp[file_id] + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) + num_samples_i = len(x_i) + s_beg = int(t_beg * fs_i) + if s_beg >= num_samples_i: + raise Exception( + "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" + % (file_id, t_beg, s_beg, file_id, num_samples_i) + ) + + s_end = int(t_end * fs_i) + if s_end > num_samples_i or t_end < 0: + s_end = num_samples_i + + x_i = x_i[s_beg:s_end] + return x_i, fs_i + + def read(self): + pass + + +class SequentialAudioReader(AudioReader): + def __init__( + self, + file_path, + segments_path=None, + wav_scale=2 ** 15 - 1, + part_idx=1, + num_parts=1, + ): + super().__init__(file_path, segments_path, wav_scale=wav_scale) + self.cur_item = 0 + self.part_idx = part_idx + self.num_parts = num_parts + if self.num_parts > 1: + if self.with_segments: + self.segments = self.segments.split(self.part_idx, self.num_parts) + else: + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=False + ) + + def __iter__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key, s, fs in r: + print(key) + process(s) + """ + return self + + def __next__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key , s, fs in r: + process(s) + """ + key, x, fs = self.read(1) + if len(key) == 0: + raise StopIteration + return key[0], x[0], fs[0] + + def next(self): + """__next__ for Python 2""" + return self.__next__() + + def reset(self): + """Returns the file pointer to the begining of the dataset, + then we can start reading the features again. + """ + self.cur_item = 0 + + def eof(self): + """End of file. + + Returns: + True, when we have read all the recordings in the dataset. + """ + if self.with_segments: + return self.cur_item == len(self.segments) + return self.cur_item == len(self.scp) + + def read(self, num_records=0, time_offset=0, time_durs=0): + """Reads next num_records audio files + + Args: + num_records: Number of audio files to read. + time_offset: List of floats indicating the start time to read in the utterance. + time_durs: List of floats indicating the number of seconds to read from each utterance + + Returns: + key: List of recording names. 
+ data: List of waveforms + fs: list of sample freqs + """ + if num_records == 0: + if self.with_segments: + num_records = len(self.segments) - self.cur_item + else: + num_records = len(self.scp) - self.cur_item + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + keys = [] + data = [] + fs = [] + for i in range(num_records): + if self.eof(): + break + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + segment = self.segments[self.cur_item] + key = segment["segment_id"] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + key, file_path, _, _ = self.scp[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + keys.append(key) + data.append(x_i) + fs.append(fs_i) + self.cur_item += 1 + + return keys, data, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + try: + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args + + +class RandomAccessAudioReader(AudioReader): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + super().__init__(file_path, segments_path, wav_scale) + + def _read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + """ + if isinstance(keys, str): + keys = [keys] + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + data = [] + fs = [] + for i, key in enumerate(keys): + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + if not (key in self.segments): + raise Exception("Key %s not found" % key) + + segment = self.segments[key] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + if not (key in self.scp): + raise Exception("Key %s not found" % key) + + file_path, _, _ = self.scp[key] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + data.append(x_i) + fs.append(fs_i) + + return data, fs + + def read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + fs: List of sampling freq. 
+ """ + try: + x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + except: + if isinstance(keys, str): + keys = [keys] + + if not isinstance(time_offset, (list, np.ndarray)): + time_offset = [time_offset] * len(keys) + if not isinstance(time_durs, (list, np.ndarray)): + time_durs = [time_durs] * len(keys) + + try: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ..." + ).format(keys, time_offset) + ) + x, fs = self._read(keys, time_offset=time_offset) + for i in range(len(x)): + end_sample = int(time_durs[i] * fs[i]) + x[i] = x[i][:end_sample] + except: + # try to read the full file + logging.info( + ( + "error-2 reading at key={}, " "retrying reading full file ..." + ).format(keys) + ) + x, fs = self._read(keys) + for i in range(len(x)): + start_sample = int(time_offset[i] * fs[i]) + end_sample = start_sample + int(time_durs[i] * fs[i]) + x[i] = x[i][start_sample:end_sample] + + return x, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("wav_scale",) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index 32032d1d..fff1ab4a 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -6,8 +6,7 @@ import logging from .bin_vad_reader import BinVADReader as BVR -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType from .segment_vad_reader import SegmentVADReader as SVR @@ -16,7 +15,6 @@ class VADReaderFactory(object): def create( rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, @@ -33,7 +31,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -48,7 +45,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -57,7 +53,6 @@ def create( @staticmethod def filter_args(**kwargs): valid_args = ( - "scp_sep", "path_prefix", "frame_shift", "frame_length", @@ -72,9 +67,6 @@ def add_class_args(parser, prefix=None): else: p1 = "--" + prefix + "." 
- parser.add_argument( - p1 + "scp-sep", default=" ", help=("scp file field separator") - ) parser.add_argument( p1 + "path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 1e42a1c3..fa675fdb 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -9,7 +9,8 @@ import numpy as np import pandas as pd -#import k2 + +# import k2 import sentencepiece as spm import torchaudio.transforms as tat from jsonargparse import ActionParser, ActionYesNo, ArgumentParser @@ -25,16 +26,11 @@ from ...utils.text import read_text from ..torch_defs import floatstr_torch -#from torch.nn.utils.rnn import pad_sequence - - - class AudioDataset(Dataset): - def __init__( self, - audio_file, + recordings_file, segments_file, class_names=None, class_files=None, @@ -46,7 +42,7 @@ def __init__( return_segment_info=None, return_orig=False, target_sample_freq=None, - wav_scale=2**15 - 1, + wav_scale=2 ** 15 - 1, is_val=False, ): @@ -61,12 +57,6 @@ def __init__( self.rank = rank self.world_size = world_size self.epoch = 0 - - if rank == 0: - logging.info("opening audio reader %s", audio_file) - - self.r = AR(audio_file, wav_scale=wav_scale) - if rank == 0: logging.info("loading segments file %s", segments_file) @@ -74,17 +64,17 @@ def __init__( if rank == 0: logging.info("dataset contains %d seqs", len(self.seg_set)) + if rank == 0: + logging.info("opening audio reader %s", recordings_file) + + audio_seg_set = self.seg_set if self.seg_set.has_time_marks else None + self.r = AR(recordings_file, segments=audio_seg_set, wav_scale=wav_scale) + self.is_val = is_val if time_durs_file is not None: - if rank == 0: - logging.info("loading durations file %s", time_durs_file) + self._load_legacy_durations(time_durs_file) - time_durs = SegmentSet.load(time_durs_file) - self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"]].class_id.values.astype(np.float, - copy=False) - else: - assert "duration" in self.seg_set + assert "duration" in self.seg_set logging.info("loading class-info files") self._load_class_infos(class_names, class_files, is_val) @@ -96,8 +86,9 @@ def __init__( if text_file is not None: logging.info("loading text files") self._load_text_infos(text_file, is_val) - self.return_segment_info = ([] if return_segment_info is None else - return_segment_info) + self.return_segment_info = ( + [] if return_segment_info is None else return_segment_info + ) self.return_orig = return_orig self.num_augs = num_augs @@ -106,9 +97,18 @@ def __init__( self.target_sample_freq = target_sample_freq self.resamplers = {} + def _load_legacy_durations(self, time_durs_file): + if self.rank == 0: + logging.info("loading durations file %s", time_durs_file) + + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(np.float, copy=False) + def _load_bpe_model(self, bpe_model, is_val): if self.rank == 0: - logging.info("loading bpe file %s" % bpe_model) + logging.info("loading bpe file %s", bpe_model) self.sp = spm.SentencePieceProcessor() self.sp.load(bpe_model) blank_id = self.sp.piece_to_id("") @@ -118,7 +118,7 @@ def _load_text_infos(self, text_file, is_val): if text_file is None: return if self.rank == 0: - logging.info("loading text file %s" % text_file) + logging.info("loading text file %s", text_file) text = read_text(text_file) self.seg_set["text"] = text.loc[self.seg_set["id"]].text @@ -131,8 +131,9 @@ def 
_load_class_infos(self, class_names, class_files, is_val): assert len(class_names) == len(class_files) for name, file in zip(class_names, class_files): - assert (name in self.seg_set - ), f"class_name {name} not present in the segment set" + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" if self.rank == 0: logging.info("loading class-info file %s" % file) table = ClassInfo.load(file) @@ -143,8 +144,9 @@ def _load_class_infos(self, class_names, class_files, is_val): segment_class_ids = self.seg_set[name].unique() for c_id in class_ids: if c_id not in segment_class_ids: - logging.warning("%s class: %s not present in dataset", - name, c_id) + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) def _create_augmenters(self, aug_cfgs): self.augmenters = [] @@ -154,12 +156,11 @@ def _create_augmenters(self, aug_cfgs): for aug_cfg in aug_cfgs: logging.info(f"loading augmentation={aug_cfg}") - augmenter = SpeechAugment.create(aug_cfg, - random_seed=112358 + - 1000 * self.rank) + augmenter = SpeechAugment.create( + aug_cfg, random_seed=112358 + 1000 * self.rank + ) self.augmenters.append(augmenter) - self.reverb_context = max(augmenter.max_reverb_context, - self.reverb_context) + self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context) def set_epoch(self, epoch): self.epoch = epoch @@ -201,12 +202,13 @@ def _parse_segment_item(self, segment): assert duration <= self.seg_set.loc[seg_id].duration, ( f"{seg_id} with start={start} duration " f"({self.seg_set.loc[seg_id].duration}) < " - f"chunk duration ({duration})") + f"chunk duration ({duration})" + ) else: seg_id, start, duration = segment, 0, 0 - if "start" in self.seg_set: - start += self.seg_set.loc[seg_id].start + # if "start" in self.seg_set: + # start += self.seg_set.loc[seg_id].start return seg_id, start, duration @@ -217,14 +219,23 @@ def _read_audio(self, seg_id, start, duration): start -= reverb_context read_duration = duration + reverb_context + # read audio + x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration) + return x[0].astype(floatstr_torch(), copy=False), fs[0] + + def _read_audio0(self, seg_id, start, duration): + # how much extra audio we need to load to + # calculate the reverb of the first part of the audio + reverb_context = min(self.reverb_context, start) + start -= reverb_context + read_duration = duration + reverb_context + # read audio recording_id = self.seg_set.recording_ids(seg_id) - x, fs = self.r.read([recording_id], - time_offset=start, - time_durs=read_duration) + x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) return x[0].astype(floatstr_torch(), copy=False), fs[0] - def _apply_augs(self, x, num_samples, reverb_context_samples): + def _apply_augs(self, x, reverb_context_samples): x_augs = {} # for each type of augmentation for i, augmenter in enumerate(self.augmenters): @@ -233,7 +244,7 @@ def _apply_augs(self, x, num_samples, reverb_context_samples): # augment x x_aug, aug_info = augmenter(x) # remove the extra left context used to compute the reverberation. 
- x_aug = x_aug[reverb_context_samples:len(x)] + x_aug = x_aug[reverb_context_samples : len(x)] x_aug = x_aug.astype(floatstr_torch(), copy=False) x_augs[f"x_aug_{i}_{j}"] = x_aug @@ -300,7 +311,7 @@ def __getitem__(self, segment): else: num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples - x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + x_augs = self._apply_augs(x, reverb_context_samples) data.update(x_augs) # add original non augmented audio @@ -311,15 +322,6 @@ def __getitem__(self, segment): else: data["x"] = x - # try: - # import soundfile as sf - - # for i, z in enumerate(r): - # sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16") - # except: - # print("soundfile failed", flush=True) - - # adds the segment labels seg_info = self._get_segment_info(seg_id) data.update(seg_info) return data @@ -329,7 +331,7 @@ def filter_args(**kwargs): ar_args = AR.filter_args(**kwargs) valid_args = ( - "audio_file", + "recordings_file", "segments_file", "aug_cfgs", "num_augs", @@ -352,48 +354,43 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - if "audio_file" not in skip: + if "recordings_file" not in skip: parser.add_argument( - "--audio-file", + "--recordings-file", required=True, - help=("audio manifest file"), + help=("recordings manifest file (kaldi .scp or pandas .csv)"), ) if "segments_file" not in skip: parser.add_argument( "--segments-file", required=True, - help=("segments manifest file"), + help=("segments manifest file (kaldi .scp or pandas .csv)"), ) parser.add_argument( "--class-names", default=None, nargs="+", - help= - ("list with the names of the types of classes in the datasets, e.g., speaker, language" - ), + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), ) parser.add_argument( - "--class-files", - default=None, - nargs="+", - help=("list of class info files"), + "--class-files", default=None, nargs="+", help=("list of class info files"), ) parser.add_argument( "--time-durs-file", default=None, - help= - ("segment to duration in secs file, if durations are not in segments_file" - ), + help=( + "(deprecated) segment to duration in secs file, if durations are not in segments_file" + ), ) parser.add_argument( - "--bpe-model", - default=None, - help=("bpe model for the text label"), + "--bpe-model", default=None, help=("bpe model for the text label"), ) parser.add_argument( @@ -418,32 +415,31 @@ def add_class_args(parser, prefix=None, skip=set()): "--return-segment-info", default=None, nargs="+", - help= - ("list of columns of the segment file which should be returned as supervisions" - ), + help=( + "list of columns of the segment file which should be returned as supervisions" + ), ) parser.add_argument( "--return-orig", default=False, action=ActionYesNo, - help= - ("when using augmentation, whether or not to return also the original audio" - ), + help=( + "when using augmentation, whether or not to return also the original audio" + ), ) parser.add_argument( "--target-sample-freq", default=None, type=int, - help= - ("target sampling frequencey, if not None all audios are converted to this sample freq" - ), + help=( + "target sampling frequencey, if not None all audios are converted to this sample freq" + ), ) AR.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='audio 
dataset options') add_argparse_args = add_class_args diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py index 2b2f0aaf..7e40dfd6 100644 --- a/hyperion/utils/feature_set.py +++ b/hyperion/utils/feature_set.py @@ -9,6 +9,7 @@ import pandas as pd from .info_table import InfoTable +from .misc import PathLike class FeatureSet(InfoTable): @@ -16,6 +17,9 @@ def __init__(self, df): super().__init__(df) assert "storage_path" in df + def add_prefix_to_storage_path(self, prefix: PathLike): + self.df["storge_path"] = self.df["storage_path"].apply(lambda x: f"{prefix}{x}") + def save(self, file_path, sep=None): """Saves info table to file @@ -31,14 +35,14 @@ def save(self, file_path, sep=None): from .scp_list import SCPList offset = self.df["storage_byte"] if "storage_byte" in self.df else None - range = None + range_spec = None if "start" and "num_frames" in self.df: - range = [ + range_spec = [ np.array([s, n], dtype=np.int64) for s, n in self.df[["start", "num_frames"]] ] scp = SCPList( - self.df["id"].values, self.df["storage_path"].values, offset, range + self.df["id"].values, self.df["storage_path"].values, offset, range_spec ) scp.save(file_path) return @@ -67,9 +71,9 @@ def load(cls, file_path, sep=None): if scp.offset is not None: df["storage_byte"] = scp.offset - if scp.range is not None: - df["start"] = [r[0] for r in scp.range] - df["num_frames"] = [r[0] for r in scp.range] + if scp.range_spec is not None: + df["start"] = [r[0] for r in scp.range_spec] + df["num_frames"] = [r[1] for r in scp.range_spec] return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index a3a1da27..5a4f27d2 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -22,6 +22,7 @@ class InfoTable: Attributes: df: pandas dataframe. """ + def __init__(self, df): self.df = df assert "id" in df, f"info_table={df}" @@ -137,10 +138,7 @@ def load(cls, file_path, sep=None, name="class_id"): sep=" ", header=None, names=["id", name], - dtype={ - "id": np.str, - name: np.str - }, + dtype={"id": np.str, name: np.str}, ) else: if sep is None: @@ -163,17 +161,16 @@ def split(self, idx, num_parts, group_by=None): Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list. 
- group_by_field: All the lines with the same value in column + group_by: All the lines with the same value in column groub_by_field go to the same part Returns: - Sub Utt2Info object + Sub InfoTable object """ - if group_by is None: + if group_by is None or group_by == "id": _, idx1 = split_list(self.df["id"], idx, num_parts) else: - _, idx1 = split_list_group_by_key(self.df[group_by], idx, - num_parts) + _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) df = self.df.iloc[idx1] return self.__class__(df) @@ -192,14 +189,10 @@ def merge(cls, tables): df = pd.concat(df_list) return cls(df) - def filter(self, - items=None, - iindex=None, - columns=None, - by="id", - keep=True): - assert (items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): + assert ( + items is None or iindex is None + ), "items and iindex cannot be not None at the same time" df = self.df if not keep: diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index f9da69fa..d51edc34 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -9,9 +9,36 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) + if "start" in df and "recording_id" not in df: + df["recording_id"] = df["id"] + + if "start" not in df and "recording_id" in df: + df["start"] = 0.0 + + @property + def has_time_marks(self): + return ( + "recording_id" in self.df and "start" in self.df and "duration" in self.df + ) + + @property + def has_recording_ids(self): + return "recording_id" in self.df def recording_ids(self, ids): if "recording_id" in self.df: return self.df.loc[ids, "recording_id"] return ids + + def recording_time_marks(self, ids): + if "recording" in self.df: + rec_col = "recording_id" + else: + rec_col = "id" + + assert "duration" in self.df + if "start" not in self.df: + self.df["start"] = 0.0 + + return self.df.loc[ids, [rec_col, "start", "duration"]] diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. 
Args: From c408f7428b7443761a0142a7b010dacf16aeaf2b Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 11 May 2023 14:15:47 -0400 Subject: [PATCH 19/89] some fixes in sre21 --- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 102 +++++++++++++ ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 66 +++++++++ ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 47 +----- .../v1.16k/local/make_voxceleb1cat_v2.pl | 4 +- egs/sre21-av-a/v1.16k/run_002_compute_evad.sh | 39 ----- .../v1.16k/run_011_train_xvector.sh | 53 ++++++- egs/voxceleb/v1.1/README.md | 52 ++++--- ...train_res2net50w26s4_xvec_stage1_v3.0.yaml | 72 +++++++++ ...train_res2net50w26s4_xvec_stage2_v3.0.yaml | 69 +++++++++ ...train_res2net50w26s8_xvec_stage1_v3.0.yaml | 72 +++++++++ ...train_res2net50w26s8_xvec_stage2_v3.0.yaml | 69 +++++++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 4 +- .../config_fbank80_stmn_tseresnet34.v3.0.sh | 2 +- hyperion/np/classifiers/svmc.py | 138 +++++++++--------- hyperion/np/np_model.py | 5 + hyperion/torch/layer_blocks/res2net_blocks.py | 3 - hyperion/torch/trainers/xvector_trainer.py | 29 ---- 17 files changed, 608 insertions(+), 218 deletions(-) create mode 100644 egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..01cfa082 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,102 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: 
exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..24b1c081 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index c8732c36..1b7c3764 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,72 +9,34 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - 
--pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth - +nnet=$nnet_dir/model_ep0004.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -88,7 +50,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl index 27b1f152..18b6d40c 100755 --- a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl +++ b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl @@ -31,7 +31,7 @@ my $meta_path = "$data_base/vox1_meta.csv"; if (! -e "$meta_path") { $meta_path = "$out_dir/vox1_meta.csv"; - system("wget -O $meta_path $meta_url"); + system("wget --no-check-certificate -O $meta_path $meta_url"); } open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; @@ -53,7 +53,7 @@ my $lid_path = "$data_base/lang_vox1_final.csv"; if (! -e "$lid_path") { $lid_path = "$out_dir/lang_vox1_final.csv"; - system("wget -O $lid_path $lid_url"); + system("wget --no-check-certificate -O $lid_path $lid_url"); } open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; my %utt2lang = (); diff --git a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh index f7aa7828..08f655ea 100755 --- a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh +++ b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh @@ -75,41 +74,3 @@ if [ $stage -le 3 ];then done fi -# #Enroll multi-speaker Datasets with time marks -# if [ $stage -le 3 ];then -# for name in sre18_dev_enroll_vast sre18_eval_enroll_vast sre19_av_a_dev_enroll sre19_av_a_eval_enroll -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? 
$num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# local/sre18_diar_to_vad.sh data/${name} exp/make_vad $vaddir -# utils/fix_data_dir.sh data/${name} -# done -# fi - -# #Dihard Datasets -# if [ $stage -le 4 ];then -# for name in dihard2_train_dev dihard2_train_eval -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# hyp_utils/rttm_to_bin_vad.sh --nj 5 data/$name/vad.rttm data/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# done - -# fi - -# if [ $stage -le 5 ];then -# utils/combine_data.sh --extra-files "utt2num_frames" data/dihard2_train data/dihard2_train_dev data/dihard2_train_eval -# utils/fix_data_dir.sh data/dihard2_train -# fi - - diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 0608929c..7f405952 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -10,28 +10,67 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + 
--in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + # Network Training if [ $stage -le 1 ]; then diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 23e0a26f..73b9bb4e 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -104,12 +104,12 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | | | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| | | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | | | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | @@ -134,16 +134,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | | | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | | | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | + + ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -163,16 +165,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | | | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | | | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | + + ### VoxSRC2022 dev | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -192,12 +196,12 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | | | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | | | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..5dda7913 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml 
b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..40fb362e --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml index 1d864080..31dcaf9a 100644 --- a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -47,7 +47,7 @@ model: dropout_rate: 0.1 
norm_before: false hid_act: swish - se_r: 128 + se_r: 256 trainer: optim: opt_type: adam @@ -67,5 +67,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 25 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh index 42af2d52..00622772 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -17,7 +17,7 @@ nnet_name=${feat_type}_tseresnet34.v3.0 nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0035.pth +nnet_s1=$nnet_s1_dir/model_ep0025.pth nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml nnet_s2_name=${nnet_name}.s2 diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 9311b8e8..6b54034b 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -9,20 +9,24 @@ import numpy as np from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from sklearn.svm import SVC as SVC +from sklearn.svm import SVC from ...hyp_defs import float_cpu from ...utils.math import softmax +from ...utils.misc import filter_func_args from ..np_model import NPModel -class GaussianSVMC(NPModel): +class SVMC(NPModel): """Gaussian Support Vector Machine for Classification.""" def __init__( self, C=1.0, + kernel="rbf", + degree=3, gamma="scale", + coef0=0.0, shrinking=True, probability=True, tol=0.0001, @@ -32,7 +36,6 @@ def __init__( class_weight=None, random_state=None, max_iter=100, - model=None, verbose=0, balance_class_weight=True, lr_seed=1024, @@ -48,25 +51,38 @@ def __init__( if random_state is None: random_state = np.random.RandomState(seed=lr_seed) + self.C = C + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.shrinking = shrinking + self.probability = probability + self.tol = tol + self.cache_size = cache_size + self.multi_class = multi_class + self.break_ties = break_ties + self.class_weight = class_weight + self.balance_class_weight = balance_class_weight - if model is None: - self.svm = SVC( - C=C, - kernel="rbf", - gamma=gamma, - shrinking=shrinking, - probability=probability, - tol=tol, - cache_size=cache_size, - class_weight=class_weight, - verbose=verbose, - max_iter=max_iter, - decision_function_shape=multi_class, - break_ties=break_ties, - random_state=random_state, - ) - else: - self.svm = model + self.svm = SVC( + C=C, + kernel=kernel, + gamma=gamma, + degree=degree, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=multi_class, + break_ties=break_ties, + random_state=random_state, + ) + self.set_labels(labels) @property @@ -84,6 +100,18 @@ def get_config(self): Dictionary with config hyperparams. 
""" config = { + "C": self.C, + "kernel": self.kernel, + "gamma": self.gamma, + "degree": self.degree, + "coef0": self.coef0, + "shrinking": self.shrinking, + "probability": self.probability, + "tol": self.tol, + "cache_size": self.cache_size, + "multi_class": self.multi_class, + "break_ties": self.break_ties, + "class_weight": self.class_weight, "balance_class_weight": self.balance_class_weight, "labels": self.labels, } @@ -135,7 +163,6 @@ def fit(self, x, class_ids, sample_weight=None): class_ids: class integer [0, num_classes-1] identifier (num_samples,) sample_weight: weight of each sample in the estimation (num_samples,) """ - print("--------------", type(x[3, 2]), type(class_ids[20]), "--------------") self.svm.fit(x, class_ids) if self.svm.fit_status_: logging.warning("SVM did not converge") @@ -153,9 +180,6 @@ def save(self, file_path): if not split_path[-1] == "sav": file_path = "".join(split_path[0] + ".sav") with open(file_path, "wb") as f: - # with h5py.File(file_path, "w") as f: - # config = self.to_json() - # f.create_dataset("config", data=np.array(config, dtype="S")) self.save_params(f) @classmethod @@ -169,27 +193,17 @@ def load(cls, file_path): Model object. """ split_path = os.path.splitext(file_path) - if not split_path[-1] == "sav": - file_path = "".join(split_path[0] + ".sav") + if not split_path[-1] == "pkl": + file_path = "".join(split_path[0] + ".pkl") - # with h5py.File(file_path, "r") as f: with open(file_path, "rb") as f: - # json_str = str(np.asarray(f["config"]).astype("U")) - # config = cls.load_config_from_json(json_str) - config = None - return cls.load_params(f, config) + return pickle.load(f) def save_params(self, f): - # params = {"A": self.A, "b": self.b} - # self._save_params_from_dict(f, params) pickle.dump(self, f) @classmethod - def load_params(cls, f, config): - # param_list = ["A", "b"] - # params = cls._load_params_to_dict(f, config["name"], param_list) - # kwargs = dict(list(config.items()) + list(params.items())) - # return cls(**kwargs) + def load_params(cls, f): svmc = pickle.load(f) return svmc @@ -200,27 +214,7 @@ def filter_class_args(**kwargs): Returns: Hyperparamter dictionary to initialize the class. 
""" - valid_args = ( - "nu", - "gamma", - "shrinking", - "probability", - "tol", - "cache_size", - "multi_class", - "break_ties", - "class_weight", - "random_state", - "max_iter", - "verbose", - "balance_class_weight", - "lr_seed", - "model", - "labels", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - filter_train_args = filter_class_args + return filter_func_args(SVMC.__init__, **kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -240,17 +234,27 @@ def add_class_args(parser, prefix=None): type=float, help="inverse of regularization strength", ) - # parser.add_argument( - # "--class_weight", - # default=None, - # help="Class weights", - # ) + parser.add_argument( + "--kernel", + default="rbf", + choices=["linear", "poly", "rbf", "sigmoid", "precomputed"], + help="kernel for svm", + ) + parser.add_argument( + "--degree", defaut=3, type=int, help="degree of polynomial kernel" + ) parser.add_argument( "--gamma", default="scale", choices=["scale", "auto"], help="Kernel coefficient for ‘rbf’", ) + parser.add_argument( + "--coef0", + default=0.0, + type=float, + help="independent term of poly and sigmoid kernels", + ) parser.add_argument( "--shrinking", default=True, @@ -264,7 +268,7 @@ def add_class_args(parser, prefix=None): help="Whether to enable probability estimates", ) parser.add_argument( - "--break_ties", + "--break-ties", default=True, type=bool, help="If true, predict will break ties according to the confidence values of decision_function; otherwise \ @@ -293,7 +297,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - "--cache_size", + "--cache-size", default=600, type=int, help="Specify the size of the kernel cache (in MB)", diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index ee464161..aa635fc5 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -99,6 +99,8 @@ def _save_params_from_dict(self, f, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_save()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) if self.name is None: prefix = "" @@ -174,6 +176,9 @@ def _load_params_to_dict(f, name, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_cpu()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) + if name is None: prefix = "" else: diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 73255a24..8de700c4 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -410,9 +410,6 @@ def forward(self, x, x_mask=None): x += residual - if not self.norm_before: - x = self.bn3(x) - if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index a9a9d98f..eddf47a7 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -88,35 +88,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # loss, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # 
use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - @record def train_epoch(self, data_loader): """Training epoch loop From 9c28408d396340a4eb59086bcf62197b9887f900 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 13 May 2023 02:29:16 +0000 Subject: [PATCH 20/89] update lid configs and np.str to str --- ...c2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml | 65 ++++++++++++++++++ ...c2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml | 65 ++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml | 67 +++++++++++++++++++ ...ec2xlsr300m_ecapatdnn1024x3_subcenter.yaml | 44 ++++++++++++ ...r300m_ecapatdnn1024x3_subcenter_do0.2.yaml | 44 ++++++++++++ .../v1/global_conf/config_lid_v3.0_13langs.sh | 44 ++++++++++++ .../v1/global_conf/config_lid_v4.0_13langs.sh | 44 ++++++++++++ .../v1/global_conf/config_lid_v4.1_13langs.sh | 44 ++++++++++++ egs/commonvoice/v1/run_011_train_asr.sh | 9 ++- hyperion/torch/data/audio_dataset.py | 2 +- .../data/class_weighted_seg_chunk_sampler.py | 4 +- .../torch/narchs/rnn_transducer_decoder.py | 20 ++++-- hyperion/utils/class_info.py | 2 +- hyperion/utils/info_table.py | 4 +- hyperion/utils/scp_list.py | 2 +- hyperion/utils/utt2info.py | 2 +- 16 files changed, 449 insertions(+), 13 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml new file mode 100644 index 00000000..56e08794 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + 
#decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml new file mode 100644 index 00000000..cf1a549f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml new file mode 100644 index 00000000..d409fb47 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + drop_last: false + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + drop_last: false + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 30000 + hold_steps: 16000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 
1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml new file mode 100644 index 00000000..27132c2d --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml new file mode 100644 index 00000000..63c914e3 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.2 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh new file mode 100644 index 00000000..40516709 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v3.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v3.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v3.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh new file mode 100644 index 00000000..e6c3afda --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml 
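Note on the sampler settings used by the LID configs above: with weight_mode "data-prior" and weight_exponent 0.3, each language is drawn in proportion to a tempered version of its total training duration. A minimal numpy sketch of that weighting; the language names and durations below are made up for illustration, and the real sampler reads total_duration from the class-info table:

    import numpy as np

    # hypothetical total training duration per language, in hours
    langs = ["en", "de", "fr", "it", "ca"]
    dur = np.array([1500.0, 800.0, 700.0, 250.0, 100.0])

    weight_exponent = 0.3       # as in the configs above
    w = dur ** weight_exponent  # temper the data prior
    w /= w.sum()                # normalize to a sampling distribution

    for lang, p in zip(langs, w):
        print(f"{lang}: {p:.3f}")
    # an exponent below 1 flattens the distribution, so low-resource languages
    # are sampled more often than their raw share of the data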
+nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh new file mode 100644 index 00000000..7d0ed120 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.1_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.1_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index e79de7af..284a68f5 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -18,7 +18,7 @@ set -e #export CONV_RSH=ssh #export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH - +export CUDA_VISIBLE_DEVICES=0,1 stage=1 ngpu=2 config_file=default_config.sh @@ -89,19 +89,24 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2transducer.py $nnet_type \ + finetune_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1236 \ --num-gpus $ngpu fi diff --git a/hyperion/torch/data/audio_dataset.py 
b/hyperion/torch/data/audio_dataset.py index 2b1f1cf6..9ffb964d 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -80,7 +80,7 @@ def __init__( time_durs = SegmentSet.load(time_durs_file) self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"]].class_id.values.astype(np.float, + self.seg_set["id"]].class_id.values.astype(float, copy=False) else: assert "duration" in self.seg_set diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index b551f342..8ec63b6f 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -204,7 +204,8 @@ def _set_class_weights(self): self.class_info.set_uniform_weights() elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values - self.class_info.set_weights(self, weights) + logging.info(weights) + self.class_info.set_weights(weights) if self.weight_exponent != 1.0: self.class_info.exp_weights(self.weight_exponent) @@ -216,6 +217,7 @@ def _set_class_weights(self): self.var_weights = np.any( self.seg_set[self.length_name] < self.max_chunk_length ) + logging.info(f'updated weight:{self.class_info["weights"]}') @property def hard_prototype_mining(self): diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index bf9189ee..efc11113 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -4,13 +4,14 @@ """ from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +import logging +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn as nn import torchaudio import torchaudio.functional -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo try: import k2 @@ -74,6 +75,7 @@ def __init__( am_scale: float = 0.0, simple_loss_scale: float = 0.5, pruned_warmup_steps: int = 2000, + # film: bool=False, ): super().__init__() @@ -615,10 +617,13 @@ def change_config( override_dropouts=False, embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, + prune_range: Optional[int] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) + if prune_range is not None: + self.prune_range = prune_range @staticmethod def filter_args(**kwargs): @@ -751,8 +756,8 @@ def add_class_args(parser, help="""type of reduction for rnn-t loss between sum or mean""") parser.add_argument( "--prune-range", - default=5, - type=int, + default=None, + type=Optional[int], help="""how many symbols to keep for each frame in k2 rnn-t pruned loss.""") parser.add_argument( @@ -804,6 +809,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): type=float, help=("dropout prob for decoder RNN ")) + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 9e158d87..4e10fac2 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -66,7 +66,7 @@ def load(cls, file_path, sep=None): if ext == "": # if no extension we load as kaldi utt2spk file df = pd.read_csv( - file_path, sep=" 
", header=None, names=["id"], dtype={"id": np.str}, + file_path, sep=" ", header=None, names=["id"], dtype={"id": str}, ) return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index f76ba6af..5db7393e 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -138,8 +138,8 @@ def load(cls, file_path, sep=None, name="class_id"): header=None, names=["id", name], dtype={ - "id": np.str, - name: np.str + "id": str, + name: str }, ) else: diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 5abf76f2..537102b4 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -36,7 +36,7 @@ def __init__(self, key, file_path, offset=None, range_spec=None): def validate(self): """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) - self.file_path = list2ndarray(self.file_path, dtype=np.object) + self.file_path = list2ndarray(self.file_path, dtype=object) assert len(self.key) == len(self.file_path) if self.offset is not None: if isinstance(self.offset, list): diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index edf2c23a..e0d27e64 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): + def load(cls, file_path, sep=" ", dtype={0: str, 1: str}): """Loads utt2info list from text file. Args: From 7f43376d4976c00b885a66463475714beb90053e Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 13 May 2023 14:33:07 -0400 Subject: [PATCH 21/89] FiLM transducer --- ...2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 86 ++ ...g_pruned_filmed_transducer_v1.0_13langs.sh | 50 ++ egs/commonvoice/v1/run_015_train_film_asr.sh | 142 +++ .../bin/train_wav2vec2rnn_film_transducer.py | 278 ++++++ hyperion/torch/layer_blocks/__init__.py | 5 + hyperion/torch/layer_blocks/film_blocks.py | 88 ++ .../layer_blocks/transducer_film_joiner.py | 79 ++ .../layer_blocks/transducer_film_predictor.py | 128 +++ hyperion/torch/models/__init__.py | 3 +- hyperion/torch/models/transducer/__init__.py | 1 + .../models/transducer/rnn_film_transducer.py | 255 ++++++ .../torch/models/wav2transducer/__init__.py | 1 + .../hf_wav2rnn_film_transducer.py | 372 ++++++++ .../hf_wav2vec2rnn_film_transducer.py | 145 +++ .../hf_wav2rnn_transducer_languageid.py | 122 +-- hyperion/torch/narchs/__init__.py | 1 + .../narchs/rnn_film_transducer_decoder.py | 843 ++++++++++++++++++ 17 files changed, 2542 insertions(+), 57 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh create mode 100755 egs/commonvoice/v1/run_015_train_film_asr.sh create mode 100755 hyperion/bin/train_wav2vec2rnn_film_transducer.py create mode 100644 hyperion/torch/layer_blocks/film_blocks.py create mode 100644 hyperion/torch/layer_blocks/transducer_film_joiner.py create mode 100644 hyperion/torch/layer_blocks/transducer_film_predictor.py create mode 100644 hyperion/torch/models/transducer/rnn_film_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py create mode 100644 hyperion/torch/narchs/rnn_film_transducer_decoder.py diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml new file mode 100644 index 00000000..7e059b3b --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh new file mode 100644 index 00000000..1fc49fdd --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s2_transducer_name=$nnet_transducer_name.s2 +nnet_s2_transducer_dir=exp/transducer_nnets/$nnet_s2_transducer_name 
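The trainer blocks in these configs share the same exp_lr fields (warmup_steps, hold_steps, decay_steps, decay_rate, min_lr). The sketch below is one plausible reading of those fields, with linear warmup, a hold phase, and exponential decay floored at min_lr; the authoritative behavior is defined by hyperion's scheduler implementation, which is not part of this patch, so treat this only as an illustration:

    def approx_lr(step, base_lr=0.005, warmup_steps=6000, hold_steps=30000,
                  decay_steps=45000, decay_rate=0.5, min_lr=4e-5):
        """Illustrative reading of the exp_lr fields used in the configs above."""
        if step < warmup_steps:
            return base_lr * step / warmup_steps       # linear warmup
        if step < warmup_steps + hold_steps:
            return base_lr                             # hold at the base lr
        n = (step - warmup_steps - hold_steps) / decay_steps
        return max(base_lr * decay_rate ** n, min_lr)  # exponential decay with a floor

    for s in [0, 6000, 30000, 80000, 200000]:
        print(s, f"{approx_lr(s):.2e}")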
+nnet_rnn_transducer=$nnet_s2_transducer_dir/model_ep0010.pth + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0016.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh new file mode 100755 index 00000000..ba1197a8 --- /dev/null +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + +# export CUDA_VISIBLE_DEVICES=0,1 +stage=1 +ngpu=1 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_film_transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --in-model-file $nnet_rnn_transducer \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + 
extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + # --master-port 1236 \ + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + + .py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..0239820f --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # 
"hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_collate) + return data_loader + + +# def init_model_from_transducer(in_model_file, rank, model_class, **kwargs): +# model_args = model_class.filter_finetune_args(**kwargs["model"]) +# # model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network ft args={}".format(model_args)) +# model = TML.load(in_model_file) +# model.change_config(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) +# return model + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + # ddp_args = ddp.filter_ddp_args(**kwargs) + # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + # kwargs["rank"] = rank + + # for Debug + rank = 0 + kwargs["rank"] = 0 + device = "cpu" + 
world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} #{"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 0487ae4f..6e2f1eb9 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ 
b/hyperion/torch/layer_blocks/__init__.py @@ -9,6 +9,7 @@ from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock +from .film_blocks import FiLM, LSTMWithFiLM, initialize_lstm_with_film from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock @@ -32,6 +33,10 @@ from .tdnn_blocks import TDNNBlock from .transducer_joiner import TransducerJoiner from .transducer_predictor import TransducerRNNPredictor, TransducerConvPredictor + +from .transducer_film_joiner import TransducerFiLMJoiner +from .transducer_film_predictor import TransducerRNNFiLMPredictor + from .transformer_conv2d_subsampler import TransformerConv2dSubsampler from .transformer_encoder_v1 import TransformerEncoderBlockV1 from .transformer_feedforward import (Conv1dLinear, Conv1dx2, diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py new file mode 100644 index 00000000..8370a42b --- /dev/null +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn + +class FiLM(nn.Module): + def __init__(self, input_size, condition_size): + # condition_size: the size of the language id vector + # input_size: the size of the RNN input to the FiLM layer + super(FiLM, self).__init__() + self.linear_scale = nn.Linear(condition_size, input_size) + self.linear_shift = nn.Linear(condition_size, input_size) + + def forward(self, x, condition): + gamma = self.linear_scale(condition).unsqueeze(2).expand_as(x) + beta = self.linear_shift(condition).unsqueeze(2).expand_as(x) + x = x * gamma + beta + return x + + + +class LSTMWithFiLM(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True): + super(LSTMWithFiLM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.batch_first = batch_first + + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) + + def forward(self, x, states, condition): + outputs = [] + h, c = states + new_h, new_c = [], [] + for i, (lstm, film) in enumerate(zip(self.lstms, self.films)): + x, (h_i, c_i) = lstm(x, (h[i].unsqueeze(0), c[i].unsqueeze(0))) + x = film(x, condition) + new_h.append(h_i) + new_c.append(c_i) + if i != self.num_layers - 1: + x = self.dropout_layer(x) + outputs.append(x) + new_h = torch.cat(new_h, dim=0) + new_c = torch.cat(new_c, dim=0) + return torch.cat(outputs, dim=0), (new_h, new_c) + + + +def initialize_lstm_with_film(lstm_with_film, pretrained_dict): + # Load pretrained LSTM state_dict + pretrained_lstm = pretrained_dict['lstm'] + pretrained_num_layers = pretrained_dict['num_layers'] + + # Copy weights from pretrained LSTM layers to LSTMWithFiLM + for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): + if i < pretrained_num_layers: + lstm.weight_ih_l0.data.copy_(pretrained_lstm['weight_ih_l' + str(i)]) + lstm.weight_hh_l0.data.copy_(pretrained_lstm['weight_hh_l' + str(i)]) + lstm.bias_ih_l0.data.copy_(pretrained_lstm['bias_ih_l' + str(i)]) + lstm.bias_hh_l0.data.copy_(pretrained_lstm['bias_hh_l' + str(i)]) + else: + # For extra 
layers in LSTMWithFiLM, just reset the weights + nn.init.xavier_uniform_(lstm.weight_ih_l0) + nn.init.orthogonal_(lstm.weight_hh_l0) + nn.init.zeros_(lstm.bias_ih_l0) + nn.init.zeros_(lstm.bias_hh_l0) + + +# def initialize_lstm_with_film(lstm_with_film, pretrained_lstm): +# # Copy weights from pretrained LSTM layers to LSTMWithFiLM +# for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): +# if i < pretrained_lstm.num_layers: +# lstm.weight_ih_l0.data.copy_(pretrained_lstm.weight_ih_l[i]) +# lstm.weight_hh_l0.data.copy_(pretrained_lstm.weight_hh_l[i]) +# lstm.bias_ih_l0.data.copy_(pretrained_lstm.bias_ih_l[i]) +# lstm.bias_hh_l0.data.copy_(pretrained_lstm.bias_hh_l[i]) +# else: +# # For extra layers in LSTMWithFiLM, just reset the weights +# nn.init.xavier_uniform_(lstm.weight_ih_l0) +# nn.init.orthogonal_(lstm.weight_hh_l0) +# nn.init.zeros_(lstm.bias_ih_l0) +# nn.init.zeros_(lstm.bias_hh_l0) + + + + # rnn = LSTMWithFiLM(embed_dim, hid_feats, num_layers, rnn_dropout_rate, batch_first=True) \ No newline at end of file diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py new file mode 100644 index 00000000..22875258 --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -0,0 +1,79 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from .film_blocks import FiLM + + +class TransducerFiLMJoiner(nn.Module): + """ RNN-T Joiner network. + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer_stateless7/joiner.py + + Attributes: + in_feats: input feature dimension. + vocab_size: vocabulary size + """ + + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int): + + super().__init__() + self.enc_feats = enc_feats + self.pred_feats = pred_feats + self.hid_feats = hid_feats + self.vocab_size = vocab_size + + self.enc_proj = nn.Linear(enc_feats, hid_feats) + self.pred_proj = nn.Linear(pred_feats, hid_feats) + self.output = nn.Linear(hid_feats, vocab_size) + + self.FiLM_encoder = FiLM(hid_feats, condition_size) + self.FiLM_joiner = FiLM(hid_feats, condition_size) + + def get_config(self): + config = { + "joiner_type": "basic", + "hid_feats": self.hid_feats, + } + return config + + def forward(self, + enc_out: torch.Tensor, + pred_out: torch.Tensor, + condition: torch.Tensor, + project_input: bool = True) -> torch.Tensor: + + """ + Args: + enc_out: output from the encoder with shape = (N, T, C) or (N, T, s_range, C) + pred_out: output from the predictor with shape = (N, U, C) or (N, T, s_range, C) + project_input: if True projects the encoder and predictor features + in the forward founction, if False it expects them outside. + Returns: + Symbols' logits of shape (N, T, U, C). 
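The FiLM blocks introduced in this patch (FiLM, LSTMWithFiLM, and the FiLM-conditioned joiner) all reduce to a feature-wise affine transform whose scale and shift are predicted from a condition vector, here a language embedding. Below is a self-contained sketch of the idea; the dimensions and tensor layout are illustrative only and are not the exact shapes the classes above expect:

    import torch
    import torch.nn as nn

    class FiLMSketch(nn.Module):
        """Feature-wise linear modulation: y = gamma(c) * x + beta(c)."""
        def __init__(self, feat_dim: int, condition_dim: int):
            super().__init__()
            self.to_gamma = nn.Linear(condition_dim, feat_dim)
            self.to_beta = nn.Linear(condition_dim, feat_dim)

        def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
            # x: (batch, time, feat_dim), c: (batch, condition_dim)
            gamma = self.to_gamma(c).unsqueeze(1)  # (batch, 1, feat_dim)
            beta = self.to_beta(c).unsqueeze(1)    # (batch, 1, feat_dim)
            return gamma * x + beta

    film = FiLMSketch(feat_dim=512, condition_dim=256)
    x = torch.randn(2, 50, 512)   # e.g. encoder or predictor activations
    c = torch.randn(2, 256)       # e.g. a per-utterance language embedding
    y = film(x, c)                # same shape as x, modulated per utterance
    print(y.shape)                # torch.Size([2, 50, 512])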
+ """ + assert enc_out.ndim == pred_out.ndim + assert enc_out.ndim in (3, 4) + + if enc_out.ndim == 3: + enc_out = enc_out.unsqueeze(2) # (N, T, 1, C) + pred_out = pred_out.unsqueeze(1) # (N, 1, U, C) + + enc_out = self.FiLM_encoder(enc_out, condition) + + if project_input: + x = self.enc_proj(enc_out) + self.pred_proj(pred_out) + else: + x = enc_out + pred_out + + x = self.FiLM_joiner(x, condition) + + x = torch.tanh(x) + logits = self.output(x) + return logits diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py new file mode 100644 index 00000000..09fae3ec --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -0,0 +1,128 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layers import ActivationFactory as AF +from .film_blocks import FiLM, LSTMWithFiLM + +class TransducerRNNFiLMPredictor(nn.Module): + """ RNN-T prediction network with LSTM or GRU + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + num_layers: Number of LSTM layers. + hid_feats: Hidden dimension of LSTM layers. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. + rnn_dropout_rate: Dropout for LSTM layers. + rnn_type: between lstm and gru + blank_id: The ID of the blank symbol. + """ + + def __init__(self, + vocab_size: int, + embed_dim: int, + num_layers: int, + hid_feats: int, + condition_size: int, + out_feats: Optional[int] = None, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + rnn_type: str = "lstm", + blank_id: int = 0): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + if rnn_type == "lstm": + self.rnn = LSTMWithFiLM( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + dropout=rnn_dropout_rate, + condition_size=condition_size, + batch_first=True, + ) + else: + raise Exception(f"Unknown RNN type {rnn_type}") + + self.out_feats = out_feats + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.hid_feats = hid_feats + self.embed_dropout_rate = embed_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + if out_feats is None: + out_feats = hid_feats + + self.out_feats = out_feats + if out_feats != hid_feats: + self.output_proj = nn.Linear(hid_feats, out_feats) + else: + self.output_proj = None + + def get_config(self): + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "num_layers": self.num_layers, + "hid_feats": self.hid_feats, + "out_feats": self.out_feats, + "embed_dropout_rate": self.embed_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, + "rnn_type": self.rnn_type, + "blank_id": self.blank_id, + } + return config + + def forward( + self, + y: torch.Tensor, + condition: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: previous y_{ prepended. 
+ states: tuple of tensors containing RNN layers states + Returns: + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the states i for RNN layers with shape (num_layers, N, C). + """ + embed = self.embedding(y) + embed = self.embed_dropout(embed) + out, (h, c) = self.rnn(embed, states, condition) + if self.output_proj: + out = self.output_proj(out) + + return out, (h, c) + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn.p = self.rnn_dropout_rate + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index a8bb24d5..419ea742 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -9,7 +9,8 @@ from .transducer import RNNTransducer, RNNRNNTransducer from .wav2languageid import HFWav2Vec2ResNet1dLanguageID from .wav2transducer import (HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) + HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer, + HFWav2Vec2RNNFiLMTransducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index ee3c85f5..9d860a22 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -6,6 +6,7 @@ from .rnn_rnn_transducer import RNNRNNTransducer from .rnn_transducer import RNNTransducer, RNNTransducerOutput +from .rnn_film_transducer import RNNFiLMTransducer from .transducer import Transducer #from .conformer import Conformer diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py new file mode 100644 index 00000000..0e8c2889 --- /dev/null +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -0,0 +1,255 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import torch + +from ....utils import HypDataClass +from ....utils.misc import filter_func_args +from ...narchs import RNNFiLMTransducerDecoder +from ...torch_model import TorchModel + + +@dataclass +class RNNTransducerOutput(HypDataClass): + + loss: torch.Tensor + loss_simple: Optional[torch.Tensor] = None + loss_pruned: Optional[torch.Tensor] = None + h_feats: Optional[List[torch.Tensor]] = None + + +class RNNFiLMTransducer(TorchModel): + """ Base-class for RNN-T in + "Sequence Transduction with Recurrent Neural Networks" + https://arxiv.org/pdf/1211.3711.pdf + + Attributes: + encoder: Encoder network module + decoder: RNN-T Decoder config. dictionary or module. 
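The decoder argument may be given as a plain config dictionary, which is instantiated internally as RNNFiLMTransducerDecoder(**decoder). A hedged sketch of such a dictionary follows, using only keys that appear in the stage-1 FiLM yaml and in the wav2vec wrapper elsewhere in this patch; the complete argument list, including the language-condition size, is defined by RNNFiLMTransducerDecoder and is not reproduced here:

    # illustrative only: keys taken from the stage-1 FiLM yaml and the wrapper code;
    # further required arguments may be omitted
    decoder_cfg = {
        "in_feats": 1024,        # set from hf_feats.hidden_size by the wav2vec wrapper
        "vocab_size": 8000,      # set from the BPE model by the training script
        "blank_id": 0,
        "rnnt_loss": "k2_pruned",
        "prune_range": 15,
        "simple_loss_scale": 0.2,
        "predictor": {"embed_dim": 1024, "num_layers": 2, "hid_feats": 512, "rnn_type": "lstm"},
        "joiner": {"hid_feats": 512},
    }
    # model = RNNFiLMTransducer(encoder=None, decoder=decoder_cfg)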
+ """ + + def __init__( + self, + encoder: Union[TorchModel, None], + decoder: Union[Dict, RNNFiLMTransducerDecoder], + ): + super().__init__() + if encoder is not None: + assert isinstance(encoder, TorchModel) + if isinstance(decoder, dict): + decoder = RNNFiLMTransducerDecoder(**decoder) + else: + assert isinstance(decoder, RNNFiLMTransducerDecoder) + + self.encoder = encoder + self.decoder = decoder + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: Union[Dict, k2.RaggedTensor], + lang: torch.Tensor, + ) -> RNNTransducerOutput: + """ + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + y: ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + - Token logits with shape = (N, vocab_size) + - RNN-T loss. + """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lengths.size(0) == y.dim0 + assert torch.all( + x_lengths[:-1] >= x_lengths[1:] + ), f"x_lengths={x_lengths}" # check x_lengths are sorted + assert lang.size(0) == y.dim0 + assert lang.size(1) == 1 + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + dec_output = self.decoder(x, x_lengths, y, lang) + output = RNNTransducerOutput(*dec_output) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + lang: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[List[int]]: + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert x.size(0) == x_lengths.size(0) + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + batch_size = x.size(0) + y = [] + for i in range(batch_size): + x_i = x[i:i + 1, :x_lengths[i]] + y_i = self.decoder.decode(x_i, + lang, + method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + y.append(y_i) + + return y + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen"] + + def get_config(self): + if self.encoder is None: + enc_cfg = None + else: + enc_cfg = self.encoder.get_config() + del enc_cfg["class_name"] + + dec_cfg = self.decoder.get_config() + del dec_cfg["class_name"] + config = { + "encoder": enc_cfg, + "decoder": dec_cfg, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + # get arguments for pooling + args = {} + decoder_args = RNNFiLMTransducerDecoder.filter_args(**kwargs["decoder"]) + args["decoder"] = decoder_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducerDecoder.add_class_args(parser, prefix="decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def change_config( + self, + decoder: Dict, + ): + logging.info("changing decoder config") + self.decoder.change_config(**decoder) + + @staticmethod + def filter_finetune_args(**kwargs): + args = {} + decoder_args = RNNFiLMTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) + args["decoder"] = decoder_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducerDecoder.add_finetune_args(parser, prefix="decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument("--decoding-method", + default="time_sync_beam_search", + choices=[ + "greedy", "time_sync_beam_search", + "align_length_sync_beam_search" + ]) + + parser.add_argument("--beam-width", + default=5, + type=int, + help="beam width for beam search") + parser.add_argument("--max-sym-per-frame", + default=3, + type=int, + help="max symbols RNN-T can emit in 1 frame") + parser.add_argument("--max-sym-per-utt", + default=1000, + type=int, + help="max symbols RNN-T can emit in 1 frame") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return filter_func_args(RNNTransducer.infer, kwargs) diff --git a/hyperion/torch/models/wav2transducer/__init__.py 
b/hyperion/torch/models/wav2transducer/__init__.py index 79af6349..cd446982 100644 --- a/hyperion/torch/models/wav2transducer/__init__.py +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -7,3 +7,4 @@ from .hf_wav2vec2_transducer import HFWav2Vec2Transducer from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer +from .hf_wav2vec2rnn_film_transducer import HFWav2Vec2RNNFiLMTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py new file mode 100644 index 00000000..48d8084b --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -0,0 +1,372 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNFiLMTransducer + + +class HFWav2RNNFiLMTransducer(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg"): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNFiLMTransducer(**transducer) + else: + assert isinstance(transducer, RNNFiLMTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + self.transducer = transducer + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start:] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + languageid, + x_lengths=None, + text=None, + return_feat_layers=None, + # return_enc_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
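For reference, a small standalone sketch of the "weighted-avg" fusion and the (batch, channels, time) layout that forward_feats hands to the transducer; the layer count and dimensions below are arbitrary:

import torch
import torch.nn as nn

num_layers, B, T, C = 13, 2, 100, 1024
hid_feats = [torch.randn(B, T, C) for _ in range(num_layers)]  # wav2vec hidden states

feat_fuser = nn.Parameter(torch.zeros(num_layers))   # learnable per-layer weights
stacked = torch.stack(hid_feats, dim=-1)              # (B, T, C, num_layers)
w = nn.functional.softmax(feat_fuser, dim=-1)         # normalized layer weights
feats = torch.sum(stacked * w, dim=-1)                # (B, T, C)
feats = feats.transpose(1, 2)                         # (B, C, T), as forward_feats returns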
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + output = self.transducer( + feats, + feat_lengths, + text, + languageid, + ) + + if return_feat_layers: + output.h_feats = hid_feats + + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + langugeid: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. + """ + + feats, _, feat_lengths = self.forward_feats(x, x_lengths) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer(feats, + feat_lengths, + langugeid, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + return y + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + 
del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNFiLMTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py new file mode 100644 index 00000000..e76867bc --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -0,0 +1,145 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer +from ..transducer import RNNFiLMTransducer +from ...layer_blocks import initialize_lstm_with_film + +class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
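As a quick sanity check on the fusion attributes described above, the number of fused hidden states follows num_encoder_layers + 1 - feat_fusion_start (the formula used in _make_fuser); assuming the 24-layer XLS-R 300M encoder used elsewhere in this recipe:

# Hypothetical numbers for the XLS-R 300M encoder used in the recipe configs.
num_encoder_layers = 24                                   # transformer blocks
feat_fusion_start = 2                                     # as set in the v1.0 configs
num_fused = num_encoder_layers + 1 - feat_fusion_start    # 23 hidden states are fused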
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNFiLMTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(transducer, dict): + # transducer["decoder"]["in_feats"] = hf_feats.hidden_size + # transducer["joiner"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in transducer: + # del transducer["class_name"] + # transducer = Transducer(**transducer) + # else: + # assert isinstance(transducer, Transducer) + # assert transducer.decoder.in_feats == hf_feats.hidden_size + # assert transducer.joiner.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNFiLMTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_class_args(parser, prefix="transducer") + HFWav2RNNFiLMTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + + + @staticmethod + def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): + model_data = None + if cfg is None or state_dict is None: + assert file_path is not None + model_data = torch.load(file_path) + if cfg is None: + cfg = model_data["model_cfg"] + if state_dict is None and model_data is not None: + state_dict = model_data["model_state_dict"] + + if "class_name" in cfg: + del cfg["class_name"] + + return cfg, state_dict + + # check again + + @classmethod + def load(cls, file_path=None, cfg=None, state_dict=None): + cfg, state_dict = TorchModel._load_cfg_state_dict( + file_path, cfg, state_dict) + + model = cls(**cfg) + if state_dict is not None: + # remove the lstm layers from the state_dict + # because the lstm are changed to lstm with film + state_dict = ODict( + [(k, v) for k, v in state_dict.items() + if not k.startswith("lstm")]) + # initialize the lstm with film with the pretrained lstm + initialize_lstm_with_film( + model.transducer.predictor.rnn, [(k, v) for k, v in state_dict.items() if k.startswith("lstm")]) + + # load the state_dict + model.load_state_dict(state_dict, strict=False) + return model \ No newline at end of file diff --git 
a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index b710655e..b9f39de8 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -40,8 +40,10 @@ class HFWav2RNNTransducerLanguageID(TorchModel): def __init__(self, hf_feats: TorchModel, - transducer: Union[Dict, TorchModel], - languageid: Union[Dict, TorchModel], + transducer: TorchModel, + languageid: TorchModel, + transducer_fuser: TorchModel, + languageid_fuser: TorchModel, feat_fusion_start: int = 0, feat_fusion_method: str = "weighted-avg", loss_weight_transducer: float = 0.005, @@ -49,46 +51,48 @@ def __init__(self, super().__init__() self.hf_feats = hf_feats - if isinstance(transducer, dict): - transducer["decoder"]["in_feats"] = hf_feats.hidden_size - #transducer["joiner"]["in_feats"] = hf_feats.hidden_size - if "class_name" in transducer: - del transducer["class_name"] - - transducer["encoder"] = None - transducer = RNNTransducer(**transducer) - else: - assert isinstance(transducer, RNNTransducer) - if transducer.encoder is None: - assert transducer.decoder.in_feats == hf_feats.hidden_size - #assert transducer.joiner.in_feats == hf_feats.hidden_size + # if isinstance(transducer, dict): + # transducer["decoder"]["in_feats"] = hf_feats.hidden_size + # #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in transducer: + # del transducer["class_name"] + + # transducer["encoder"] = None + # transducer = RNNTransducer(**transducer) + # else: + # assert isinstance(transducer, RNNTransducer) + # if transducer.encoder is None: + # assert transducer.decoder.in_feats == hf_feats.hidden_size + # #assert transducer.joiner.in_feats == hf_feats.hidden_size self.transducer = transducer self.languageid = languageid + self.transducer_fuser = transducer_fuser + self.languageid_fuser = languageid_fuser + self.feat_fusion_start = feat_fusion_start self.feat_fusion_method = feat_fusion_method self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self._hf_context = contextlib.nullcontext() - self._make_fuser() - - def _make_fuser(self): - if self.feat_fusion_method == "last": - self.feat_fuser = None - return - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start - layer_dim = self.hf_feats.hidden_size - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - elif self.feat_fusion_method == "linear": - self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers - elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + # def _make_fuser(self, transducer_fuser, languageid_fuser): + # if self.feat_fusion_method == "last": + # self.feat_fuser = None + # return + + # num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + # layer_dim = self.hf_feats.hidden_size + # if self.feat_fusion_method == "weighted-avg": + # self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + # elif self.feat_fusion_method == "linear": + # self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + # self.feat_fuser.weight.data = torch.ones(1, + # num_layers) / num_layers + # elif self.feat_fusion_method == "cat": + # self.feat_fuser = 
nn.Linear(num_layers * layer_dim, + # layer_dim, + # bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -106,18 +110,23 @@ def _fuse_hid_feats(self, hid_feats): hid_feats = hid_feats[self.feat_fusion_start:] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) - feats = torch.sum(hid_feats * norm_weights, dim=-1) + norm_weights_transducer = nn.functional.softmax(self.transducer_fuser, dim=-1) + norm_weights_languageid = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats_transducer = torch.sum(hid_feats * norm_weights_transducer, dim=-1) + feats_languageid = torch.sum(hid_feats * norm_weights_languageid, dim=-1) elif self.feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) + feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) elif self.feat_fusion_method == "cat": hid_feats = torch.cat(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats) + feats_transducer = self.transducer_fuser(hid_feats) + feats_languageid = self.languageid_fuser(hid_feats) elif self.feat_fusion_method == "last": - feats = hid_feats[-1] + feats_transducer = hid_feats[-1] + feats_languageid = hid_feats[-1] - return feats + return feats_transducer, feats_languageid def forward_feats(self, x, @@ -138,12 +147,14 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats = self._fuse_hid_feats(hid_feats) + feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) else: hid_feats = None - feats = hf_output["last_hidden_state"] + feats_transducer = hf_output["last_hidden_state"] + feats_languageid = hf_output["last_hidden_state"] - feats = feats.transpose(1, 2) + feats_transducer = feats_transducer.transpose(1, 2) + feats_languageid = feats_languageid.transpose(1, 2) if return_feat_layers is not None: # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
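The hunk above replaces the single feature fuser with one per task; a minimal sketch of the "weighted-avg" case, showing that the ASR and LID branches share the same hidden states and differ only in their learned layer weights (sizes are arbitrary):

import torch
import torch.nn as nn

num_layers, B, T, C = 13, 2, 100, 1024
hid_feats = torch.stack([torch.randn(B, T, C) for _ in range(num_layers)], dim=-1)

transducer_fuser = nn.Parameter(torch.zeros(num_layers))   # ASR branch weights
languageid_fuser = nn.Parameter(torch.zeros(num_layers))   # LID branch weights

w_asr = nn.functional.softmax(transducer_fuser, dim=-1)
w_lid = nn.functional.softmax(languageid_fuser, dim=-1)
feats_transducer = torch.sum(hid_feats * w_asr, dim=-1)    # (B, T, C) for the transducer
feats_languageid = torch.sum(hid_feats * w_lid, dim=-1)    # (B, T, C) for the LID head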
@@ -154,7 +165,7 @@ def forward_feats(self, else: hid_feats = None - return feats, hid_feats, feat_lengths + return feats_transducer, feats_languageid, hid_feats, feat_lengths def forward( self, @@ -185,13 +196,13 @@ def forward( Dataclass with losses, "h_enc" (list of hidden encoder layers), "h_feats" (wav2vec features) """ - feats, hid_feats, feat_lengths = self.forward_feats( + feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) - - + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + logits = self.languageid( - feats, + feats_languageid, None, languageid, return_enc_layers=return_enc_layers, @@ -200,21 +211,17 @@ def forward( ) loss_lid = nn.CrossEntropyLoss()(logits, languageid) - - - - feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + trans_output = self.transducer( - feats, + feats_transducer, feat_lengths, text, ) - if return_feat_layers: trans_output.h_feats = hid_feats - output = RNNTransducerLanguageIDOutput( self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) + output = RNNTransducerLanguageIDOutput(self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) return output def infer(self, @@ -236,9 +243,9 @@ def infer(self, List of list of integer indexes of the recognizer's symbols. """ - feats, _, feat_lengths = self.forward_feats(x, x_lengths) + feats_transducer, _, _, feat_lengths = self.forward_feats(x, x_lengths) - feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + feats = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) y = self.transducer.infer(feats, feat_lengths, @@ -341,11 +348,14 @@ def filter_args(**kwargs): def get_config(self): hf_cfg = self.hf_feats.get_config() tran_cfg = self.transducer.get_config() + lid_cfg = self.languageid.get_config() del hf_cfg["class_name"] del tran_cfg["class_name"] + del lid_cfg["class_name"] config = { "hf_feats": hf_cfg, "transducer": tran_cfg, + "languageid": lid_cfg, "feat_fusion_start": self.feat_fusion_start, "feat_fusion_method": self.feat_fusion_method, "loss_weight_transducer": self.loss_weight_transducer, diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 4fe8b4ed..049f5d23 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -22,6 +22,7 @@ from .resnet_factory import ResNetFactory from .rnn_encoder import RNNEncoder from .rnn_transducer_decoder import RNNTransducerDecoder +from .rnn_film_transducer_decoder import RNNFiLMTransducerDecoder from .spinenet import * from .spinenet_factory import SpineNetFactory from .tdnn import TDNNV1 diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py new file mode 100644 index 00000000..cf1652b5 --- /dev/null +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -0,0 +1,843 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from dataclasses import dataclass +import logging +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torchaudio +import torchaudio.functional +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo + +try: + 
import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +from ...utils.misc import filter_func_args +from ...utils.text import add_sos +from ..layer_blocks import TransducerJoiner as Joiner +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor, TransducerConvPredictor as ConvPredictor +from .net_arch import NetArch + + +@dataclass +class Hypothesis: + ys: List[int] # predicted sequences + log_prob: float # log prob of ys + + # Optional LSTM predictor state. + pred_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + +class RNNFiLMTransducerDecoder(NetArch): + """ RNN-T Decoder composed of Predictor and Joiner networks + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/transducer.py + + Attributes: + in_feats: input features dimension (encoder output) + vocab_size: Number of tokens of the modeling unit including blank. + predictor: Dictionary with the predictor options. + joiner: Dictionary with the joiner options. + blank_id: id of the null symbol. + rnnt_loss: type of rnn-t loss between torchaudio, k2 or k2_pruned. + rnnt_type: rnn-t variation between regular, modified or constrained. + delay_penalty: penalize symbol delay, which is used to make symbol + emit earlier. + reduction: type of reduction for rnn-t loss between sum or mean + prune_range: how many symbols to keep for each frame in k2 rnn-t + pruned loss. + lm_scale: language model scale in rnn-t smoothed loss. + am_scale: acoustic model scale in rnn-t smoothed loss. + simple_loss_scale: weight of rnn-t simple loss when using k2 pruned loss. + pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1. + """ + + def __init__( + self, + in_feats: int, + vocab_size: int, + predictor: Dict, + joiner: Dict, + blank_id: int = 0, + rnnt_loss: str = "k2_pruned", + rnnt_type: str = "regular", + delay_penalty: float = 0.0, + reduction: str = "sum", + prune_range: int = 5, + lm_scale: float = 0.25, + am_scale: float = 0.0, + simple_loss_scale: float = 0.5, + pruned_warmup_steps: int = 2000, + langs_size: int = 13, + condition_size: int = 64, + ): + + super().__init__() + self.in_feats = in_feats + self.vocab_size = vocab_size + self.predictor_args = predictor + self.joiner_args = joiner + self.blank_id = blank_id + self.rnnt_loss = rnnt_loss + self.rnnt_type = rnnt_type + self.delay_penalty = delay_penalty + self.reduction = reduction + self.prune_range = prune_range + self.lm_scale = lm_scale + self.am_scale = am_scale + self.simple_loss_scale = simple_loss_scale + self.pruned_warmup_steps = pruned_warmup_steps + self.condition_size = condition_size + + + self._make_predictor() + self._make_joiner() + # make embedding layer for language id + self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.rnnt_loss == "k2_pruned": + self.simple_am_proj = nn.Linear(in_feats, vocab_size) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, + vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, + dtype=torch.int)) + + def _make_predictor(self): + pred_type = self.predictor_args["pred_type"] + self.predictor_args["in_feats"] = self.in_feats + self.predictor_args["vocab_size"] = self.vocab_size + self.predictor_args["blank_id"] = self.blank_id + self.predictor_args["condition_size"] = self.condition_size + # Add FiLM args to the predictor args + if pred_type == "rnn": + pred_args = filter_func_args(RNNPredictor.__init__, + self.predictor_args) + self.predictor = RNNPredictor(**pred_args) + 
# elif pred_type == "conv": + # pred_args = filter_func_args(ConvPredictor.__init__, + # self.predictor_args) + # self.predictor = ConvPredictor(**pred_args) + else: + raise ValueError(f"Unknown predictor type {pred_type}") + + def _make_joiner(self): + joiner_type = self.joiner_args["joiner_type"] + # Add FiLM args to the joiner args + + if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size, self.condition_size) + else: + raise ValueError(f"Unknown joiner type {joiner_type}") + + def get_config(self): + config = { + "in_feats": self.in_feats, + "vocab_size": self.vocab_size, + "predictor": self.predictor_args, + "joiner": self.joiner_args, + "blank_id": self.blank_id, + "rnnt_loss": self.rnnt_loss, + "rnnt_type": self.rnnt_type, + "delay_penalty": self.delay_penalty, + "reduction": self.reduction, + "prune_range": self.prune_range, + "lm_scale": self.lm_scale, + "am_scale": self.am_scale, + "simple_loss_scale": self.simple_loss_scale, + "pruned_warmup_steps": self.pruned_warmup_steps, + "condition_size": self.condition_size, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + logits = self.joiner(x, pred_out, lang_embedding) + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + x_lengths = x_lengths.to(torch.int32) + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lengths, + target_lengths=y_lengths, + blank=self.blank_id, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + logits = self.joiner(x, pred_out, lang_embedding) + + with torch.cuda.amp.autocast(enabled=False): + loss = k2.rnnt_loss( + logits=logits.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + am_simple = self.simple_am_proj(x) + lm_simple = self.simple_lm_proj(pred_out) + with torch.cuda.amp.autocast(enabled=False): + loss_simple, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm_simple.float(), + am=am_simple.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + lm_only_scale=self.lm_scale, + am_only_scale=self.am_scale, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + return_grad=True, + ) + + 
# ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=self.prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.enc_proj(x), + lm=self.joiner.pred_proj(pred_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). + logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) + + + with torch.cuda.amp.autocast(enabled=False): + loss_pruned = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + + if self.cur_step > self.pruned_warmup_steps: + simple_loss_scale = self.simple_loss_scale + pruned_loss_scale = 1.0 + else: + r = self.cur_step / self.pruned_warmup_steps + simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) + pruned_loss_scale = 0.1 + 0.9 * r + self.cur_step += 1 + # print(simple_loss_scale, pruned_loss_scale) + + loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned + + return loss, loss_simple, loss_pruned + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # embed lang + lang_embedding = self.lang_embedding(lang) + # get y_lengths + row_splits = y.shape.row_splits(1) + y_lengths = row_splits[1:] - row_splits[:-1] + # shift y adding token + sos_y = add_sos(y, sos_id=self.blank_id) + sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + # apply predictor and joiner + pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + loss_simple = loss_pruned = None + if self.rnnt_loss == "k2_pruned": + loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( + x, x_lengths, y, y_lengths, pred_out, lang_embedding) + elif self.rnnt_loss == "k2": + loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out, lang_embedding) + elif self.rnnt_loss == "torchaudio": + loss_simple = loss_pruned = None + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, + pred_out, lang_embedding) + + return loss, loss_simple, loss_pruned + + def decode(self, + x: torch.Tensor, + lang: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, ) -> List[int]: + + # embed lang + lang_embedding = self.lang_embedding(lang) + if method == "time_sync_beam_search": + return self.decode_time_sync_beam_search(x, + lang_embedding, + x_lengths, + beam_width=beam_width) + elif method == "align_length_sync_beam_search": + return self.decode_align_length_sync_beam_search( + x, + x_lengths, + lang_embedding, + beam_width=beam_width, + max_sym_per_utt=max_sym_per_utt) + elif method == "greedy": + return self.decode_greedy(x, + lang_embedding, + x_lengths, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + def decode_greedy(self, + x: torch.Tensor, + lang_embedding: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[int]: + """ + Args: + 
x: encoder embeddings with shape = (N, T, C) + Returns: + Decoded tokens + """ + assert x.ndim == 3 + + # support only batch_size == 1 for now + assert x.size(0) == 1, x.size(0) + blank_id = self.blank_id + device = x.device + + sos = torch.Tensor([blank_id], device=device, + dtype=torch.int64).reshape(1, 1) + pred_out, (h, c) = self.predictor(sos, lang_embedding) + T = x.size(1) + t = 0 + hyp = [] + + sym_per_frame = 0 + sym_per_utt = 0 + + while t < T and sym_per_utt < max_sym_per_utt: + x_t = x[:, t:t + 1, :] + logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) + # logits is + + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + # TODO: Use logits.argmax() + y = log_prob.argmax() + if y != blank_id: + hyp.append(y.item()) + y = y.reshape(1, 1) + pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) + + sym_per_utt += 1 + sym_per_frame += 1 + + if y == blank_id or sym_per_frame > max_sym_per_frame: + sym_per_frame = 0 + t += 1 + + return hyp + + def decode_time_sync_beam_search(self, + x: torch.Tensor, + lang_embedding: torch.Tensor, + x_lengths: torch.Tensor = None, + beam_width: int = 5) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + pred_out, (h, c) = self.predictor(sos, lang_embedding) + T = x.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + + while t < T and u < max_u: + x_t = x[:, t:t + 1, :] + A = B + B = [] + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.Tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + + pred_out, pred_state = self.predictor( + pred_in, + lang, + y_star.pred_state, + ) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + logits = self.joiner(x_t, pred_out, lang_embedding) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. 
+ # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + pred_state=y_star.pred_state, + ) + B.append(new_y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B = sorted( + [ + hyp + for hyp in B if hyp.log_prob > A_most_probable.log_prob + ], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + t += 1 + + try: + best_hyp = max(B, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + except: + return "" + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def decode_align_length_sync_beam_search( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + lang_embedding: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + pred_out, (h, c) = self.predictor(sos, lang_embedding) + T = x.size(1) + #t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + #max_u = 20000 # terminate after this number of steps + #u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + F = [] + #for t < T and u < max_u: + for i in range(T + max_sym_per_utt): + A = [] + for y_star in B: + #while u < max_u: + u = len(y_star.ys) - 1 + t = i - u + if t >= T: + continue + + #y_star = max(A, key=lambda hyp: hyp.log_prob) + #A.remove(y_star) + x_t = x[:, t:t + 1, :] + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.Tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + + pred_out, pred_state = self.predictor( + pred_in, + lang_embedding, + y_star.pred_state, + ) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + logits = self.joiner(x_t, pred_out, lang_embedding) + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() # (vocab_size,) + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.pred_state here + pred_state=y_star.pred_state, + 
) + A.append(new_y_star) + if t == T - 1: + F.append(y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + # check whether B contains more than "beam_width" elements more probable + # than the most probable in A + #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B0 = sorted( + [hyp for hyp in A], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + B = [] + B_ys = set() + for hyp in B0: + hyp_ys = tuple(hyp.ys) # to make ys hashable + if hyp_ys not in B_ys: + B.append(hyp) + B_ys.add(hyp_ys) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + + best_hyp = max(F, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + prune_range: Optional[int] = None, + ): + logging.info("changing decoder config") + self.predictor.change_config(override_dropouts, embed_dropout_rate, + rnn_dropout_rate) + if prune_range is not None: + self.prune_range = prune_range + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(RNNFiLMTransducerDecoder.__init__, kwargs) + return args + + @staticmethod + def filter_finetune_args(**kwargs): + args = filter_func_args(RNNFiLMTransducerDecoder.change_config, kwargs) + return args + + @staticmethod + def add_pred_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--pred-type", + default="rnn", + choices=["rnn", "conv"], + help= + """type of predictor between RNN and Convolutional [rnn, conv]""") + pred_parser.add_argument("--embed-dim", + default=1024, + type=int, + help=("token embedding dimension")) + pred_parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for predictor input embeddings")) + pred_parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """) + pred_parser.add_argument( + "--rnn-type", + default="lstm", + choices=["lstm", "gru"], + help= + """type of recurrent network for thep predictor in [lstm, gru]""") + + pred_parser.add_argument("--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """) + + pred_parser.add_argument("--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""") + pred_parser.add_argument("--out-feats", + default=512, + type=int, + help="""output features of the predictor""") + pred_parser.add_argument("--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""") + + parser.add_argument("--predictor", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_joiner_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--joiner-type", + default="basic", + choices=["basic"], + help= + """type of joiner network, there is only basic joiner for now""") + pred_parser.add_argument("--hid-feats", 
+ default=512, + type=int, + help="""hidden features of the joiner""") + parser.add_argument("--joiner", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) + if "blank_id" not in skip: + parser.add_argument("--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model")) + if "vocab_size" not in skip: + parser.add_argument("--vocab-size", + type=int, + required=True, + help=("output prediction dimension")) + + RNNFiLMTransducerDecoder.add_pred_args(parser) + RNNFiLMTransducerDecoder.add_joiner_args(parser) + parser.add_argument( + "--rnnt-loss", + default="k2_pruned", + choices=["torchaudio", "k2", "k2_pruned"], + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + parser.add_argument( + "--rnnt-type", + default="regular", + choices=["regular", "modified", "constrained"], + help= + """type of rnn-t loss between regular, modified or constrained.""") + parser.add_argument( + "--delay-penalty", + default=0.0, + type=float, + help= + """penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""") + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + parser.add_argument( + "--prune-range", + default=None, + type=Optional[int], + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + parser.add_argument( + "--lm-scale", + default=0.25, + type=float, + help="""language model scale in rnn-t smoothed loss""") + parser.add_argument( + "--am-scale", + default=0.0, + type=float, + help="""acoustic model scale in rnn-t smoothed loss""") + parser.add_argument( + "--simple-loss-scale", + default=0.5, + type=float, + help="""weight of rnn-t simple loss when using k2 pruned loss""") + parser.add_argument( + "--pruned-warmup-steps", + default=2000, + type=int, + help="""number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1""") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model.")) + parser.add_argument("--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 20c13e7abb532a3453142124fde7e240fa455ae5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 14 May 2023 00:35:47 +0000 Subject: [PATCH 22/89] Add FiLMed Transducer --- ...2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 6 +- ...c2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml | 67 
+++++++++++++++++++ .../v1/global_conf/config_lid_v4.0_13langs.sh | 5 +- .../bin/train_wav2vec2rnn_film_transducer.py | 17 ++--- hyperion/torch/layer_blocks/film_blocks.py | 65 +++++------------- .../layer_blocks/transducer_film_joiner.py | 10 ++- .../layer_blocks/transducer_film_predictor.py | 4 +- .../models/transducer/rnn_film_transducer.py | 1 - .../hf_wav2rnn_film_transducer.py | 33 +++++++-- .../narchs/rnn_film_transducer_decoder.py | 4 +- .../trainers/transducer_languageid_trainer.py | 4 +- 11 files changed, 135 insertions(+), 81 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml index 7e059b3b..7110b50e 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -6,6 +6,7 @@ data: - conf/reverb_noise_aug.yaml return_segment_info: - text + - language sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' @@ -17,7 +18,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.3 + num_chunks_per_seg_epoch: 0.1 data_loader: num_workers: 1 @@ -28,6 +29,7 @@ data: wav_scale: 1 return_segment_info: - text + - language sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' @@ -59,7 +61,7 @@ model: rnn_type: lstm joiner: hid_feats: 512 - feat_fusion_method: weighted-avg + feat_fusion_method: film-weighted-avg feat_fusion_start: 2 trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml new file mode 100644 index 00000000..d270d62c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + languageid: + cos_scale: 32.0 +trainer: + optim: + opt_type: sgd + lr: 0.0005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: full + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh 
b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh index e6c3afda..ba42ad38 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh @@ -28,9 +28,8 @@ nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0014.pth - -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.0.yaml +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py index 0239820f..f06cc684 100755 --- a/hyperion/bin/train_wav2vec2rnn_film_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -144,15 +144,15 @@ def train_model(gpu_id, args): #torch.backends.cudnn.benchmark = False # torch.backends.cudnn.enabled = False - # ddp_args = ddp.filter_ddp_args(**kwargs) - # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - # kwargs["rank"] = rank + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank - # for Debug - rank = 0 - kwargs["rank"] = 0 - device = "cpu" - world_size=1 + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) @@ -172,6 +172,7 @@ def train_model(gpu_id, args): **trn_args, ) trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() trainer.fit(train_loader, val_loader) ddp.ddp_cleanup() diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 8370a42b..5caeab76 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -9,10 +9,15 @@ def __init__(self, input_size, condition_size): self.linear_scale = nn.Linear(condition_size, input_size) self.linear_shift = nn.Linear(condition_size, input_size) - def forward(self, x, condition): - gamma = self.linear_scale(condition).unsqueeze(2).expand_as(x) - beta = self.linear_shift(condition).unsqueeze(2).expand_as(x) - x = x * gamma + beta + def forward(self, x, lang_condition): + if x.ndim == 3: + gamma = self.linear_scale(lang_condition).unsqueeze(1).expand_as(x) + beta = self.linear_shift(lang_condition).unsqueeze(1).expand_as(x) + x = x * gamma + beta + elif x.ndim == 4: + gamma = self.linear_scale(lang_condition).unsqueeze(1).unsqueeze(2).expand_as(x) + beta = self.linear_shift(lang_condition).unsqueeze(1).unsqueeze(2).expand_as(x) + x = x * gamma + beta return x @@ -30,13 +35,15 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) - def forward(self, x, states, condition): + def forward(self, x, states, lang_condition): outputs = [] - h, c = states new_h, new_c = [], [] for i, (lstm, film) in enumerate(zip(self.lstms, self.films)): - x, (h_i, c_i) = lstm(x, (h[i].unsqueeze(0), c[i].unsqueeze(0))) - x = film(x, condition) + if states: + x, (h_i, c_i) = lstm(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + else: + x, (h_i, c_i) = lstm(x) + x 
= film(x, lang_condition) new_h.append(h_i) new_c.append(c_i) if i != self.num_layers - 1: @@ -44,45 +51,5 @@ def forward(self, x, states, condition): outputs.append(x) new_h = torch.cat(new_h, dim=0) new_c = torch.cat(new_c, dim=0) - return torch.cat(outputs, dim=0), (new_h, new_c) + return x, (new_h, new_c) - - -def initialize_lstm_with_film(lstm_with_film, pretrained_dict): - # Load pretrained LSTM state_dict - pretrained_lstm = pretrained_dict['lstm'] - pretrained_num_layers = pretrained_dict['num_layers'] - - # Copy weights from pretrained LSTM layers to LSTMWithFiLM - for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): - if i < pretrained_num_layers: - lstm.weight_ih_l0.data.copy_(pretrained_lstm['weight_ih_l' + str(i)]) - lstm.weight_hh_l0.data.copy_(pretrained_lstm['weight_hh_l' + str(i)]) - lstm.bias_ih_l0.data.copy_(pretrained_lstm['bias_ih_l' + str(i)]) - lstm.bias_hh_l0.data.copy_(pretrained_lstm['bias_hh_l' + str(i)]) - else: - # For extra layers in LSTMWithFiLM, just reset the weights - nn.init.xavier_uniform_(lstm.weight_ih_l0) - nn.init.orthogonal_(lstm.weight_hh_l0) - nn.init.zeros_(lstm.bias_ih_l0) - nn.init.zeros_(lstm.bias_hh_l0) - - -# def initialize_lstm_with_film(lstm_with_film, pretrained_lstm): -# # Copy weights from pretrained LSTM layers to LSTMWithFiLM -# for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): -# if i < pretrained_lstm.num_layers: -# lstm.weight_ih_l0.data.copy_(pretrained_lstm.weight_ih_l[i]) -# lstm.weight_hh_l0.data.copy_(pretrained_lstm.weight_hh_l[i]) -# lstm.bias_ih_l0.data.copy_(pretrained_lstm.bias_ih_l[i]) -# lstm.bias_hh_l0.data.copy_(pretrained_lstm.bias_hh_l[i]) -# else: -# # For extra layers in LSTMWithFiLM, just reset the weights -# nn.init.xavier_uniform_(lstm.weight_ih_l0) -# nn.init.orthogonal_(lstm.weight_hh_l0) -# nn.init.zeros_(lstm.bias_ih_l0) -# nn.init.zeros_(lstm.bias_hh_l0) - - - - # rnn = LSTMWithFiLM(embed_dim, hid_feats, num_layers, rnn_dropout_rate, batch_first=True) \ No newline at end of file diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py index 22875258..7fdae60d 100644 --- a/hyperion/torch/layer_blocks/transducer_film_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -33,8 +33,7 @@ def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: self.pred_proj = nn.Linear(pred_feats, hid_feats) self.output = nn.Linear(hid_feats, vocab_size) - self.FiLM_encoder = FiLM(hid_feats, condition_size) - self.FiLM_joiner = FiLM(hid_feats, condition_size) + self.film = FiLM(hid_feats, condition_size) def get_config(self): config = { @@ -46,7 +45,7 @@ def get_config(self): def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor, - condition: torch.Tensor, + lang_condition: torch.Tensor, project_input: bool = True) -> torch.Tensor: """ @@ -60,19 +59,18 @@ def forward(self, """ assert enc_out.ndim == pred_out.ndim assert enc_out.ndim in (3, 4) - if enc_out.ndim == 3: enc_out = enc_out.unsqueeze(2) # (N, T, 1, C) pred_out = pred_out.unsqueeze(1) # (N, 1, U, C) - enc_out = self.FiLM_encoder(enc_out, condition) + # enc_out = self.FiLM_encoder(enc_out, lang_condition) if project_input: x = self.enc_proj(enc_out) + self.pred_proj(pred_out) else: x = enc_out + pred_out - x = self.FiLM_joiner(x, condition) + x = self.film(x, lang_condition) x = torch.tanh(x) logits = self.output(x) diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py 
b/hyperion/torch/layer_blocks/transducer_film_predictor.py index 09fae3ec..dbb93218 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -93,7 +93,7 @@ def get_config(self): def forward( self, y: torch.Tensor, - condition: torch.Tensor, + lang_condition: torch.Tensor, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ @@ -106,7 +106,7 @@ def forward( """ embed = self.embedding(y) embed = self.embed_dropout(embed) - out, (h, c) = self.rnn(embed, states, condition) + out, (h, c) = self.rnn(embed, states, lang_condition) if self.output_proj: out = self.output_proj(out) diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py index 0e8c2889..444c4521 100644 --- a/hyperion/torch/models/transducer/rnn_film_transducer.py +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -83,7 +83,6 @@ def forward( x_lengths[:-1] >= x_lengths[1:] ), f"x_lengths={x_lengths}" # check x_lengths are sorted assert lang.size(0) == y.dim0 - assert lang.size(1) == 1 if self.encoder is not None: x, x_lengths = self.encoder(x, x_lengths) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 48d8084b..7f6b9ba7 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -13,6 +13,7 @@ from ...torch_model import TorchModel from ...utils import remove_silence +from ...layer_blocks import FiLM from ..transducer import RNNFiLMTransducer @@ -63,7 +64,13 @@ def _make_fuser(self): num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start layer_dim = self.hf_feats.hidden_size - if self.feat_fusion_method == "weighted-avg": + if self.feat_fusion_method == "film-weighted-avg": + self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "weighted-avg-film": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + self.film = FiLM(layer_dim, self.transducer.decoder.condition_size) + elif self.feat_fusion_method == "weighted-avg": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) @@ -74,11 +81,12 @@ def _make_fuser(self): layer_dim, bias=False) - def _fuse_hid_feats(self, hid_feats): + def _fuse_hid_feats(self, hid_feats, lang): """Fuses the hidden features from the Wav2Vec model. Args: hid_feats: list of hidden features Tensors from Wav2Vec model. + lang: language id Tensor. 
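For reference, the FiLM conditioning used by the blocks in this patch predicts a per-channel scale (gamma) and shift (beta) from the language embedding and applies them to the hidden features. A minimal, self-contained sketch of that operation (module name, feature sizes and the toy usage are illustrative, not the recipe's actual classes):

import torch
import torch.nn as nn

class FiLMSketch(nn.Module):
    """Feature-wise Linear Modulation: x -> gamma(cond) * x + beta(cond)."""

    def __init__(self, feat_dim, cond_dim):
        super().__init__()
        self.to_gamma = nn.Linear(cond_dim, feat_dim)
        self.to_beta = nn.Linear(cond_dim, feat_dim)

    def forward(self, x, cond):
        # x: (batch, time, feat_dim), cond: (batch, cond_dim)
        gamma = self.to_gamma(cond).unsqueeze(1)  # (batch, 1, feat_dim), broadcast over time
        beta = self.to_beta(cond).unsqueeze(1)
        return gamma * x + beta

# toy usage: 2 utterances, 50 frames, 8-dim features, 4-dim language embedding
film = FiLMSketch(feat_dim=8, cond_dim=4)
x = torch.randn(2, 50, 8)
lang_emb = torch.randn(2, 4)
print(film(x, lang_emb).shape)  # torch.Size([2, 50, 8])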
Returns: Tensor of fused features (batch, channels, time) @@ -87,8 +95,19 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] + lang_condition = self.transducer.decoder.lang_embedding(lang) hid_feats = hid_feats[self.feat_fusion_start:] - if self.feat_fusion_method == "weighted-avg": + if self.feat_fusion_method == "film-weighted-avg": + film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = torch.stack(film_hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(film_hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "weighted-avg-film": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + feats = self.film(feats, lang_condition) + elif self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) feats = torch.sum(hid_feats * norm_weights, dim=-1) @@ -106,6 +125,7 @@ def _fuse_hid_feats(self, hid_feats): def forward_feats(self, x, x_lengths, + lang: torch.Tensor, return_feat_layers=None, chunk_length=0, detach_chunks=False): @@ -122,7 +142,7 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats = self._fuse_hid_feats(hid_feats) + feats = self._fuse_hid_feats(hid_feats, lang) else: hid_feats = None feats = hf_output["last_hidden_state"] @@ -168,8 +188,9 @@ def forward( Dataclass with losses, "h_enc" (list of hidden encoder layers), "h_feats" (wav2vec features) """ + feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, languageid, return_feat_layers) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) output = self.transducer( @@ -204,7 +225,7 @@ def infer(self, List of list of integer indexes of the recognizer's symbols. 
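The two fusion modes added above differ only in whether FiLM is applied to each wav2vec layer before the learned weighted average (film-weighted-avg) or once to the averaged features (weighted-avg-film). A rough sketch of the weighted-average step itself, with a made-up layer count and dimensions rather than the recipe's actual hyperparameters:

import torch
import torch.nn as nn

def weighted_avg_fusion(hid_feats, layer_weights):
    """Fuse a list of per-layer hidden states with softmax-normalized weights.

    hid_feats: list of tensors, each (batch, time, feat_dim)
    layer_weights: learnable tensor of shape (num_layers,)
    """
    stacked = torch.stack(hid_feats, dim=-1)        # (batch, time, feat_dim, num_layers)
    norm_w = torch.softmax(layer_weights, dim=-1)   # weights sum to 1
    return torch.sum(stacked * norm_w, dim=-1)      # (batch, time, feat_dim)

# toy usage: 5 hypothetical transformer layers
num_layers = 5
weights = nn.Parameter(torch.zeros(num_layers))     # uniform average at initialization
feats = [torch.randn(2, 50, 16) for _ in range(num_layers)]
print(weighted_avg_fusion(feats, weights).shape)    # torch.Size([2, 50, 16])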
""" - feats, _, feat_lengths = self.forward_feats(x, x_lengths) + feats, _, feat_lengths = self.forward_feats(x, x_lengths, languageid) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index cf1652b5..9f42a09c 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -20,8 +20,8 @@ from ...utils.misc import filter_func_args from ...utils.text import add_sos -from ..layer_blocks import TransducerJoiner as Joiner -from ..layer_blocks import TransducerRNNPredictor as RNNPredictor, TransducerConvPredictor as ConvPredictor +from ..layer_blocks import TransducerFiLMJoiner as Joiner +from ..layer_blocks import TransducerRNNFiLMPredictor as RNNPredictor from .net_arch import NetArch diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py index 238e8022..2e9df702 100644 --- a/hyperion/torch/trainers/transducer_languageid_trainer.py +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -78,7 +78,7 @@ def __init__( swa_anneal_epochs=10, cpu_offload=False, input_key="x", - target_key=["text", "languageid"], + target_key=["text", "language"], ): loss = None @@ -214,7 +214,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): skip=super_skip) if "target_key" not in skip: parser.add_argument("--target-keys", - default=["text", "languageid"], + default=["text", "language"], help="list of dict. key for nnet targets") if prefix is not None: From f8c84a9977d61e65cc4bc2ab67ce4af792e73836 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 14 May 2023 01:17:23 +0000 Subject: [PATCH 23/89] remove unused function --- hyperion/torch/layer_blocks/__init__.py | 2 +- .../models/wav2transducer/hf_wav2vec2rnn_film_transducer.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 6e2f1eb9..62c096b2 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -9,7 +9,7 @@ from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock -from .film_blocks import FiLM, LSTMWithFiLM, initialize_lstm_with_film +from .film_blocks import FiLM, LSTMWithFiLM from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py index e76867bc..6d1ea944 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -12,7 +12,6 @@ from ...tpm import HFWav2Vec2 from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer from ..transducer import RNNFiLMTransducer -from ...layer_blocks import initialize_lstm_with_film class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): """Class for RNN-T with Wav2Vec2 features From 05474decc6016ecdf8521c2078dd2b7cc01c7dc1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 15 May 2023 03:13:47 +0000 Subject: [PATCH 24/89] Add decode script and configurations --- ...2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 8 +- 
.../global_conf/config_lid_v4.0_13langs_v3.sh | 43 ++++++++++ egs/commonvoice/v1/local/initailize_model.py | 55 ++++++++++++ .../decode_wav2vec2rnn_transducer.sh | 79 ++++++++++++++++++ .../decode_wav2vec2rnn_transducer_lid.sh | 83 +++++++++++++++++++ 5 files changed, 265 insertions(+), 3 deletions(-) create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh create mode 100644 egs/commonvoice/v1/local/initailize_model.py create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml index 7110b50e..7d3d133e 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -11,6 +11,7 @@ data: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' max_batch_length: 50 + max_audio_length: 15. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -34,6 +35,7 @@ data: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' max_batch_length: 50 + max_audio_length: 15. min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -41,7 +43,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 1.0 + num_chunks_per_seg_epoch: 0.5 data_loader: num_workers: 1 model: @@ -66,7 +68,7 @@ model: trainer: optim: opt_type: sgd - lr: 0.005 + lr: 0.002 momentum: 0.9 weight_decay: 4e-4 lrsched: @@ -85,4 +87,4 @@ trainer: eff_batch_size: 128 train_mode: hf-feats-frozen-nograd - \ No newline at end of file + diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh new file mode 100644 index 00000000..8d6cbc80 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s4 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + 
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_model.py b/egs/commonvoice/v1/local/initailize_model.py new file mode 100644 index 00000000..7ae9db8e --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_model.py @@ -0,0 +1,55 @@ +import torch + +# arguments example +# pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# film_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +pretrained_model = torch.load(sys.argv[1]) +film_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def update_film_lstm_parameters(film_state_dict, pretrained_state_dict): + for i in range(2): + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_ih_l0"] = pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_hh_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_ih_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_hh_l' + str(i)].clone() + return film_state_dict + + +def copy_model_parameters(pretrained_model, film_model): + pretrained_state_dict = pretrained_model["model_state_dict"] + film_state_dict = film_model["model_state_dict"] + + update_state_dict = {name: param for name, param in pretrained_state_dict.items() if name in film_state_dict and param.shape == film_state_dict[name].shape} + new_film_state_dict = film_state_dict.copy() + new_film_state_dict.update(update_state_dict) + + new_film_state_dict = update_film_lstm_parameters(new_film_state_dict, pretrained_state_dict) + + film_model["model_state_dict"] = new_film_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in film_state_dict.items(): + if torch.all(torch.eq(param, new_film_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in pretrained_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + film_model["epoch"] =1 + torch.save(film_model, output_model) + + +unchanged_parameters = copy_model_parameters(pretrained_model, film_model) \ No newline at end of file diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh new file mode 100755 index 00000000..986c8190 --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. 
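The initialization script above transfers weights from a plain RNN-T checkpoint into the FiLM variant: parameters whose names and shapes match are copied directly, and the stacked predictor LSTM is split across the per-layer LSTMs of LSTMWithFiLM. A condensed sketch of that idea, following the key layout shown in the script but with placeholder checkpoint paths:

import sys
import torch

def copy_matching_parameters(pretrained_sd, target_sd):
    """Overwrite target entries whose name and shape match the pretrained ones."""
    updated = dict(target_sd)
    for name, param in pretrained_sd.items():
        if name in updated and param.shape == updated[name].shape:
            updated[name] = param.clone()
    return updated

def split_stacked_lstm(pretrained_sd, target_sd, src_prefix, dst_prefix, num_layers=2):
    """Map layer i of a stacked nn.LSTM onto the i-th single-layer LSTM of LSTMWithFiLM."""
    for i in range(num_layers):
        for kind in ("weight_ih", "weight_hh", "bias_ih", "bias_hh"):
            src = f"{src_prefix}.{kind}_l{i}"      # e.g. ...predictor.rnn.weight_ih_l0
            dst = f"{dst_prefix}.{i}.{kind}_l0"    # e.g. ...predictor.rnn.lstms.0.weight_ih_l0
            if src in pretrained_sd and dst in target_sd:
                target_sd[dst] = pretrained_sd[src].clone()
    return target_sd

if __name__ == "__main__":
    # usage: python init_sketch.py pretrained.pth film_init.pth output.pth (paths are placeholders)
    pretrained = torch.load(sys.argv[1], map_location="cpu")
    film = torch.load(sys.argv[2], map_location="cpu")
    new_sd = copy_matching_parameters(pretrained["model_state_dict"], film["model_state_dict"])
    new_sd = split_stacked_lstm(
        pretrained["model_state_dict"], new_sd,
        src_prefix="module.transducer.decoder.predictor.rnn",
        dst_prefix="module.transducer.decoder.predictor.rnn.lstms")
    film["model_state_dict"] = new_sd
    torch.save(film, sys.argv[3])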
+nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + +fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh new file mode 100755 index 00000000..3bf84cbd --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ] && [ $# != 6 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 +lang_file=$5 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer_languageid.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --lang-file $lang_file \ + --output_transducer $output_dir/transducer.JOB.text \ + --output_languageid $output_dir/languageid.JOB $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + cat $output_dir/languageid.* > $output_dir/langs + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + # python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + +fi From cb4f20eb2b03b4243a9e750d9b0039de610eea0d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 16 May 2023 02:56:27 +0000 Subject: [PATCH 25/89] remove redundant code --- .../hf_wav2vec2rnn_film_transducer.py | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py index 6d1ea944..513d193c 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -104,41 +104,3 @@ def add_finetune_args(parser, prefix=None): action=ActionParser(parser=parser)) - - @staticmethod - def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): - model_data = None - if cfg is None or state_dict is None: - assert file_path is not None - model_data = torch.load(file_path) - if cfg is None: - cfg = model_data["model_cfg"] - if state_dict is None and model_data is not None: - state_dict = model_data["model_state_dict"] - - if "class_name" in cfg: - del cfg["class_name"] - - return cfg, state_dict - - # check again - - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) - - model = cls(**cfg) - if state_dict is not None: - # remove the lstm layers from the state_dict - # because the lstm are changed to lstm with film - state_dict = ODict( - [(k, v) for k, v in 
state_dict.items() - if not k.startswith("lstm")]) - # initialize the lstm with film with the pretrained lstm - initialize_lstm_with_film( - model.transducer.predictor.rnn, [(k, v) for k, v in state_dict.items() if k.startswith("lstm")]) - - # load the state_dict - model.load_state_dict(state_dict, strict=False) - return model \ No newline at end of file From 2b61053496a9034c6ede99c4fac00d2e9472fc39 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 17 May 2023 19:51:58 +0000 Subject: [PATCH 26/89] update fine-tune and decoding scripts for rnnt_film_transducer --- ...2base_rnnt_film_k2_pruned_stage2_v1.0.yaml | 75 +++++ egs/commonvoice/v1/run_031_inference_film.sh | 49 ++++ .../decode_wav2vec2rnn_film_transducer.sh | 82 ++++++ .../bin/decode_wav2vec2rnn_film_transducer.py | 167 +++++++++++ .../finetune_wav2vec2rnn_film_transducer.py | 261 ++++++++++++++++++ .../models/transducer/rnn_film_transducer.py | 3 +- .../hf_wav2rnn_film_transducer.py | 9 +- .../narchs/rnn_film_transducer_decoder.py | 12 +- 8 files changed, 648 insertions(+), 10 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml create mode 100755 egs/commonvoice/v1/run_031_inference_film.sh create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh create mode 100755 hyperion/bin/decode_wav2vec2rnn_film_transducer.py create mode 100755 hyperion/bin/finetune_wav2vec2rnn_film_transducer.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml new file mode 100644 index 00000000..a867f12a --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.5 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh new file mode 100755 index 00000000..7b796107 --- /dev/null +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + +# test_data=test_clean + + +# Extracts x-vectors for evaluation +for name in $test_data +do + nj=40 + steps_transducer/decode_wav2vec2rnn_film_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model data/$nnet_data/langs +done + diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh new file mode 100755 index 00000000..ebd6398d --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ] && [ $# != 6 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 +lang_file=$5 + +for f in $data_dir/wav.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_film_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --lang_input $data_dir/utt2lang \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --lang-file $lang_file \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + +fi diff --git a/hyperion/bin/decode_wav2vec2rnn_film_transducer.py b/hyperion/bin/decode_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..17cb0c3f --- /dev/null +++ b/hyperion/bin/decode_wav2vec2rnn_film_transducer.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import sentencepiece as spm +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import HFWav2Vec2RNNFiLMTransducer +from hyperion.torch.models.wav2transducer.beam_search import (beam_search, + greedy_search) +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info +from hyperion.utils.class_info import ClassInfo +from hyperion.utils.segment_set import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-film-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_transducer(input_spec, lang_input_spec, output_spec, 
scp_sep, model_path, bpe_model, lang_file, + infer_args, use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + # load language dict form langfile by row number + lang_info = ClassInfo.load(lang_file) + utt2lang = SegmentSet.load(lang_input_spec) + + + logging.info("bpe-model=%s", bpe_model) + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + + infer_args = HFWav2Vec2RNNFiLMTransducer.filter_infer_args(**infer_args) + logging.info(f"infer-args={infer_args}") + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s", output_spec) + with open(output_spec, "w") as writer: + logging.info(f"opening input stream: {input_spec} with args={ar_args}") + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x, fs = reader.read(1) + lang = utt2lang.loc[key, "class_id"] + lang_id = torch.tensor([lang_info.loc[lang, "class_idx"]]).to(torch.int64) + if len(key) == 0: + break + + x, key, fs = x[0], key[0], fs[0] + t2 = time.time() + logging.info("processing utt %s", key) + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype()).to(device) + + tot_frames = x.shape[1] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if x.shape[1] == 0: + y = [""] + else: + #y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor((x.shape[1], ), + dtype=torch.long, + device=device) + + y = model.infer(x=x, x_lengths=x_lengths, languageid=lang_id, **infer_args) + + y = sp.decode(y[0]) + logging.info(f"utt: {key} hyps: {y}") + t3 = time.time() + writer.write(f"{key} {y}\n") + + t4 = time.time() + tot_time = t4 - t1 + infer_time = t3 - t2 + logging.info( + ("utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + key, + tot_time, + t2 - t1, + infer_time, + t4 - t3, + x.shape[1] / fs / infer_time, + x.shape[1] / fs / tot_time, + ) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("ASR decoding for RNN-T with Wav2vec features")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--lang_input", dest="lang_input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + parser.add_argument("--model-path", required=True) + parser.add_argument("--bpe-model", required=True) + parser.add_argument("--lang-file", required=True) + + HFWav2Vec2RNNFiLMTransducer.add_infer_args(parser, "infer-args") + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) diff --git a/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..5ff51348 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft 
args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = 
Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py index 444c4521..68066442 100644 --- a/hyperion/torch/models/transducer/rnn_film_transducer.py +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -105,6 +105,7 @@ def infer(self, Args: x: input features with shape = (N, T, C) x_lengths: feature number for frames with shape = (N,) + lang: language id for each utterance with shape = (N,) decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. max_sym_per_utt: maximimum number of symbols in a single utterance. @@ -251,4 +252,4 @@ def add_infer_args(parser, prefix=None): @staticmethod def filter_infer_args(**kwargs): - return filter_func_args(RNNTransducer.infer, kwargs) + return filter_func_args(RNNFiLMTransducer.infer, kwargs) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 7f6b9ba7..dc28abb7 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -179,6 +179,7 @@ def forward( x: input features tensor with shape=(batch, in_feats, time) x_lengths: time lengths of the features with shape=(batch,) y: target classes torch.long tensor with shape=(batch,) + languageid: language id torch.long tensor with shape=(batch,) return_feat_layers: list of integers indicating, which wav2vec layers we should return. If None, no wav2vec layers are returned. return_enc_layers: list of integers indicating, which encoder layers @@ -208,7 +209,7 @@ def forward( def infer(self, x: torch.Tensor, x_lengths: torch.Tensor, - langugeid: torch.Tensor, + languageid: torch.Tensor, decoding_method="time_sync_beam_search", beam_width: int = 5, max_sym_per_frame: int = 3, @@ -218,20 +219,22 @@ def infer(self, Args: x: input features with shape = (N, T, C) x_lengths: feature number for frames with shape = (N,) + languageid: language id torch.long tensor with shape=(batch,) decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. max_sym_per_utt: maximimum number of symbols in a single utterance. Returns: List of list of integer indexes of the recognizer's symbols. 
""" - + # import pdb; pdb.set_trace() + languageid = languageid[0] feats, _, feat_lengths = self.forward_feats(x, x_lengths, languageid) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) y = self.transducer.infer(feats, feat_lengths, - langugeid, + languageid, decoding_method=decoding_method, beam_width=beam_width, max_sym_per_frame=max_sym_per_frame, diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 9f42a09c..91a30caf 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -356,7 +356,7 @@ def decode_greedy(self, blank_id = self.blank_id device = x.device - sos = torch.Tensor([blank_id], device=device, + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) pred_out, (h, c) = self.predictor(sos, lang_embedding) T = x.size(1) @@ -399,7 +399,7 @@ def decode_time_sync_beam_search(self, blank_id = self.blank_id device = x.device - sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, (h, c) = self.predictor(sos, lang_embedding) T = x.size(1) t = 0 @@ -424,12 +424,12 @@ def decode_time_sync_beam_search(self, cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.Tensor([y_star.ys[-1]], + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) pred_out, pred_state = self.predictor( pred_in, - lang, + lang_embedding, y_star.pred_state, ) cache[cached_key] = (pred_out, pred_state) @@ -523,7 +523,7 @@ def decode_align_length_sync_beam_search( blank_id = self.blank_id device = x.device - sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, (h, c) = self.predictor(sos, lang_embedding) T = x.size(1) #t = 0 @@ -552,7 +552,7 @@ def decode_align_length_sync_beam_search( cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.Tensor([y_star.ys[-1]], + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) pred_out, pred_state = self.predictor( From ca5327a9d1ae66b54340bc1c372ac670284236ef Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 18 May 2023 00:23:40 +0000 Subject: [PATCH 27/89] update language id trainer to use chunk for training --- egs/commonvoice/v1/conf/infer.yaml | 2 + ...c2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml | 65 +++++++++++++++++++ .../wav2vec2xlsr300m_ecapatdnn1024x3.yaml | 43 ++++++++++++ .../v1/global_conf/config_lid_v4.2_13langs.sh | 43 ++++++++++++ egs/commonvoice/v1/run_015_train_film_asr.sh | 10 +-- hyperion/torch/trainers/languageid_trainer.py | 12 ++-- hyperion/torch/trainers/torch_trainer.py | 1 + 7 files changed, 167 insertions(+), 9 deletions(-) create mode 100644 egs/commonvoice/v1/conf/infer.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh diff --git a/egs/commonvoice/v1/conf/infer.yaml b/egs/commonvoice/v1/conf/infer.yaml new file mode 100644 index 00000000..1f0ebfa7 --- /dev/null +++ b/egs/commonvoice/v1/conf/infer.yaml @@ -0,0 +1,2 @@ +beam_width: 5 +decoding_method: time_sync_beam_search \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml 
b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml new file mode 100644 index 00000000..12b8c371 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml new file mode 100644 index 00000000..08964a38 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml @@ -0,0 +1,43 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh new file mode 100644 index 00000000..49721635 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.2_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh index ba1197a8..fbf30558 100755 --- a/egs/commonvoice/v1/run_015_train_film_asr.sh +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -14,13 +14,12 @@ set -e #module load cuda/11.6.0 #ml #nvidia-smi -#export CUDA_VISIBLE_DEVICES=0,1,2,3 +# export CUDA_VISIBLE_DEVICES=0,1,2,3 #export CONV_RSH=ssh #export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH -# export CUDA_VISIBLE_DEVICES=0,1 stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -76,6 +75,7 @@ if [ $stage -le 1 ]; then --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ --in-model-file $nnet_rnn_transducer \ + --master-port 1237 \ --num-gpus $ngpu fi @@ -90,7 +90,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2rnn_transducer.py $nnet_type \ + finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ @@ -107,8 +107,8 @@ if [ $stage -le 2 ]; then --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ --num-gpus $ngpu - # --master-port 1236 \ fi diff --git a/hyperion/torch/trainers/languageid_trainer.py 
b/hyperion/torch/trainers/languageid_trainer.py index 0770cb8f..add56c1e 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -93,7 +93,7 @@ def train_epoch(self, data_loader): data_loader: pytorch data loader returning features and class labels. """ batch_keys = [ - self.input_key, f"{self.input_key}_lengths", self.target_key + self.input_key, self.target_key ] self.model.update_loss_margin(self.cur_epoch) @@ -107,8 +107,10 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, input_lengths, target = tensors_subset( + input_data, target = tensors_subset( data, batch_keys, self.device) + # input_data, input_lengths, target = tensors_subset( + # data, batch_keys, self.device) batch_size = input_data.shape[0] with self.amp_autocast(): @@ -152,7 +154,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): sw_update_bn: wheter or not, update batch-norm layers in SWA. """ batch_keys = [ - self.input_key, f"{self.input_key}_lengths", self.target_key + self.input_key, self.target_key ] metric_acc = MetricAcc(self.device) batch_metrics = ODict() @@ -165,8 +167,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, input_lengths, target = tensors_subset( + input_data, target = tensors_subset( data, batch_keys, self.device) + # input_data, input_lengths, target = tensors_subset( + # data, batch_keys, self.device) batch_size = input_data.shape[0] # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 00a218f9..7ae7c50e 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -626,6 +626,7 @@ def load_last_checkpoint(self): for epoch in range(self.epochs, 0, -1): file_path = "%s/model_ep%04d.pth" % (self.exp_path, epoch) if os.path.isfile(file_path): + logging.info("Loading checkpoint %s" % file_path) return self.load_checkpoint(file_path) return None From 27d579cb7247bba1983cd4abac42e836e796355a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sat, 20 May 2023 17:42:13 -0400 Subject: [PATCH 28/89] sre21 16k recipe finished --- egs/sre21-av-a/v1.16k/README.md | 51 ++- .../v1.16k/conf/lresnet34_lid_v1.yaml | 59 --- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 2 + ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 2 +- .../v1.16k/conf/train_lresnet34_lid_v1.yaml | 78 ++++ ...train_res2net50w26s4_xvec_stage1_v1.0.yaml | 80 ++++ ...train_res2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ ...train_res2net50w26s8_xvec_stage1_v1.0.yaml | 82 ++++ ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 66 +++ ...in_tseres2net50w26s4_xvec_stage1_v1.0.yaml | 83 ++++ ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 2 +- ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 32 +- ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 67 ---- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 71 +--- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 25 +- ...et50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh | 0 ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 49 +++ ...et50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh | 0 .../v1.16k/run_011_train_xvector.sh | 1 - .../v1.16k/run_012_finetune_xvector.sh | 61 --- egs/sre21-av-a/v1.16k/run_014_train_lid.sh | 34 +- egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh 
| 2 +- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 104 +++++ ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 66 +++ ...train_res2net50w26s8_xvec_stage1_v1.0.yaml | 82 ++++ ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 65 +++ ...in_tseres2net50w26s4_xvec_stage1_v1.0.yaml | 83 ++++ ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ egs/sre21-av-a/v1.8k/default_config.sh | 2 +- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 50 +-- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 48 +++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 68 ---- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 58 +++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 76 ---- egs/sre21-av-a/v1.8k/run_011_train_xvector.sh | 54 ++- .../v1.8k/run_012_finetune_xvector.sh | 61 --- egs/voxceleb/v1.1/run_002_compute_evad.sh | 1 - egs/voxceleb/v1.2/run_001_prepare_data.sh | 14 +- hyp_utils/create_data_link.pl | 132 ++++++ hyp_utils/create_data_split_dirs.sh | 46 +++ hyp_utils/create_data_split_links.sh | 23 ++ hyp_utils/create_split_dir.pl | 92 +++++ ...l_xvec_cosine_scoring_from_adv_test_wav.py | 8 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 14 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 2 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 4 +- ...sine_scoring_from_transfer_adv_test_wav.py | 2 +- ...sine_scoring_from_transfer_art_test_wav.py | 3 +- hyperion/bin/eval_xvec_logits_from_wav.py | 48 +-- hyperion/bin/extract_xvectors_from_wav.py | 2 +- .../bin/extract_xvectors_slidwin_from_wav.py | 14 +- hyperion/bin/finetune_xvector_from_wav.py | 77 +--- hyperion/data_prep/__init__.py | 2 +- hyperion/data_prep/voxceleb1.py | 338 ++++++++++++++++ hyperion/data_prep/voxceleb2.py | 12 +- hyperion/np/classifiers/__init__.py | 2 +- hyperion/torch/layers/global_pool.py | 7 +- hyperion/torch/models/xvectors/xvector.py | 6 +- hyperion/torch/narchs/audio_feats_mvn.py | 9 +- hyperion/torch/narchs/classif_head.py | 2 +- hyperion/torch/narchs/dc1d_decoder.py | 4 +- hyperion/torch/narchs/dc1d_encoder.py | 4 +- hyperion/torch/narchs/dc2d_decoder.py | 4 +- hyperion/torch/narchs/dc2d_encoder.py | 4 +- hyperion/torch/narchs/fcnet.py | 2 +- hyperion/torch/narchs/resnet.py | 16 +- hyperion/torch/narchs/resnet1d_decoder.py | 14 +- hyperion/torch/narchs/resnet1d_encoder.py | 20 +- hyperion/torch/narchs/resnet2d_decoder.py | 14 +- hyperion/torch/narchs/resnet2d_encoder.py | 19 +- hyperion/torch/narchs/resnet_factory.py | 4 +- hyperion/torch/narchs/spinenet.py | 16 +- hyperion/torch/narchs/spinenet_factory.py | 4 +- hyperion/torch/narchs/tdnn_factory.py | 4 +- .../torch/narchs/transformer_encoder_v1.py | 4 +- .../trainers/xvector_trainer_from_wav.py | 8 +- hyperion/torch/utils/masking.py | 7 +- hyperion/utils/__init__.py | 2 + hyperion/utils/dataset.py | 379 +++++++++++++++--- hyperion/utils/enrollment_map.py | 86 ++++ hyperion/utils/info_table.py | 12 +- hyperion/utils/segment_set.py | 4 + hyperion/utils/sparse_trial_key.py | 58 +++ hyperion/utils/trial_key.py | 82 +++- 85 files changed, 2625 insertions(+), 868 deletions(-) delete mode 100644 egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml create mode 
100644 egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml delete mode 100644 egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh rename egs/sre21-av-a/v1.16k/global_conf/{ => deprecated}/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh (100%) create mode 100644 egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh rename egs/sre21-av-a/v1.16k/global_conf/{ => deprecated}/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh (100%) delete mode 100755 egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh create mode 100644 egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh delete mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh create mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh delete mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh delete mode 100755 egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh create mode 100755 hyp_utils/create_data_link.pl create mode 100755 hyp_utils/create_data_split_dirs.sh create mode 100755 hyp_utils/create_data_split_links.sh create mode 100755 hyp_utils/create_split_dir.pl create mode 100644 hyperion/data_prep/voxceleb1.py create mode 100644 hyperion/utils/enrollment_map.py diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index e35577d7..0f5d09ad 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -88,8 +88,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_013_prepare_langid_train_data.sh` @@ -110,8 +108,8 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_040_eval_be_v1.sh, run_041_eval_be_v2.sh, run_042_eval_be_v3.sh, run_042b_eval_be_v3.sh` - Evals different back-end versions: - V1: Back-end trained on all data without adaptation - - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, global PLDA adapted to SRE-Vox-CHN - - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, source dependent PLDA adapted to SRE-CHN or Vox-CHN + - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, global PLDA adapted to SRE-Vox-CHN + - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, source dependent PLDA adapted to SRE-CHN 
or Vox-CHN - V3b: V3 with hyperparameters tuned for x-vectors trained on VoxCeleb only - `run_fus*.sh` @@ -120,4 +118,47 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.57 | 0.135 | 0.237 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.23 | 0.136 | 0.187 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.38 | 0.147 | 0.189 | + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.91 | 0.393 | 0.409 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 5.22 | 0.370 | 0.377 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.79 | 0.309 | 0.325 | + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.68 | 0.395 | 0.401 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.92 | 0.405 | 0.412 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.80 | 0.357 | 0.360 | diff --git a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml deleted file mode 100644 index 5451702f..00000000 --- a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml +++ /dev/null @@ -1,59 +0,0 @@ -min_chunk_length: 4.0 -max_chunk_length: 4.0 -return_fullseqs: false -wav_scale: 32767 -batch_size: 512 -var_batch_size: false -iters_per_epoch: 6.0 -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -feats: fbank64_stmn_nb_16k.yaml -pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 32 -embed_dim: 32 -num_embed_layers: 1 -hid_act: relu6 -loss_type: arc-softmax -s: 30.0 -margin: 0.3 -margin_warmup_epochs: 30.0 -dropout_rate: 0.0 -in_feats: 64 -resnet_type: lresnet34 -in_channels: 1 -conv_channels: 64 -base_channels: 64 -in_kernel_size: 3 -in_stride: 1 -in_norm: false -no_maxpool: true -optim: - opt_type: adam - lr: 0.02 - # lr: 0.01 - beta1: 0.9 - beta2: 0.95 - amsgrad: true - weight_decay: 1e-5 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 8000 - hold_steps: 10000 - min_lr: 1.0e-05 - warmup_steps: 1000 - update_lr_on_opt_step: true -grad_acc_steps: 1 -epochs: 70 -log_interval: 100 -use_tensorboard: false -use_wandb: false -wandb: - mode: online -ddp_type: ddp -use_amp: true -swa_start: 0 -swa_lr: 0.001 -swa_anneal_epochs: 10 -num_gpus: 4 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml index 01cfa082..d68ea26e 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -68,6 +68,7 @@ model: multilayer_concat: true endpoint_channels: 8192 dropout_rate: 0.0 + hid_act: relu6 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 @@ -76,6 +77,7 @@ model: margin: 0.3 margin_warmup_epochs: 20.0 dropout_rate: 0.0 + hid_act: relu6 trainer: optim: opt_type: adam diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml index 24b1c081..e7f9969b 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -46,7 +46,7 @@ model: trainer: optim: opt_type: sgd - lr: 1e-3 + lr: 0.01 momentum: 0.9 weight_decay: 1e-5 lrsched: diff --git a/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml new file mode 100644 index 00000000..c46365db --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 +feats: 
fbank64_stmn_nb_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 64 + conv_channels: 64 + in_kernel_size: 3 + in_stride: 1 + in_norm: false + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 32 + embed_dim: 32 + num_embed_layers: 1 + hid_act: relu6 + loss_type: arc-softmax + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 30.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + beta1: 0.9 + beta2: 0.95 + amsgrad: true + weight_decay: 1e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + epochs: 70 + log_interval: 100 + use_amp: true + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..7a9234b6 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,80 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: mean+stddev + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 50 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..9884bb4c --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + 
min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 21 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..4c427202 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + 
seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..10607607 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 
+ data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 1b7c3764..1da68697 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -27,7 +27,7 @@ nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0004.pth + # xvector full net finetuning with out-of-domain ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh index 1903369e..6d14f27d 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -1,4 +1,4 @@ -# LResNet34 x-vector with mixed precision training +# Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,50 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.05 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 +nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s4_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0071.pth - +nnet=$nnet_dir/model_ep0061.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # 
effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=21 ft_margin=0.5 -ft_margin_warmup=5 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0021.pth @@ -61,7 +44,4 @@ ft_nnet=$ft_nnet_dir/model_ep0021.pth plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh deleted file mode 100644 index 344e1288..00000000 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ /dev/null @@ -1,67 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -#nnet=$nnet_dir/swa_model_ep0061.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end 
-plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index cae32b57..0b62008e 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,103 +9,40 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 s=30 margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=10 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# xvector last-layer finetuning in-domain -reg_layers_classif=0 -reg_layers_enc="0 1 2 3 4" -nnet_adapt_data=voxcelebcat_sre_alllangs_mixfs_chnspks - -# ft2_batch_size_1gpu=4 -# ft2_eff_batch_size=128 # effective batch size -# ft2_ipe=4 -# ft2_lr=0.01 -# ft2_nnet_num_epochs=12 -# ft2_margin_warmup=3 -# ft2_reg_weight_embed=0.1 -# ft2_min_chunk=10 -# ft2_max_chunk=60 - -# ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 
--use-amp --var-batch-size" -# ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -# ft2_nnet_name=${ft_nnet_name}.ft_eaffine_rege_w${ft2_reg_weigth_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v2 -# ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -# ft2_nnet=$ft2_nnet_dir/model_ep0010.pth - - -# xvector full nnet finetuning -ft2_batch_size_1gpu=6 -ft2_eff_batch_size=128 # effective batch size -ft2_ipe=1 -ft2_lr=0.01 -ft2_nnet_num_epochs=15 -ft2_margin=0.5 -ft2_margin_warmup=3 -ft2_reg_weight_embed=0.1 -ft2_reg_weight_enc=0.1 -ft2_min_chunk=10 -ft2_max_chunk=10 - -ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft2_nnet_name=${ft_nnet_name}.ft_reg_wenc${ft2_reg_weight_enc}_we${ft2_reg_weight_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v1 -ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -ft2_nnet=$ft2_nnet_dir/model_ep0012.pth - - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 96475c53..a57f16d9 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,21 +9,15 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=tseres2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 se_r=256 s=30 @@ -31,13 +25,8 @@ margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml 
+nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0075.pth nnet=$nnet_dir/swa_model_ep0076.pth @@ -49,12 +38,9 @@ ft_min_chunk=10 ft_max_chunk=15 ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -69,7 +55,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh new file mode 100644 index 00000000..b5863308 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -0,0 +1,49 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxcelebcat + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_args="--model.pool_net.pool-type mean+stddev" +nnet_name=${feat_type}_res2net50w26s8_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +#nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh 
b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 7f405952..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -19,7 +19,6 @@ num_workers="" list_dir=data/${nnet_data}_proc_audio_no_sil -args="" if [ -n "$num_workers" ];then extra_args="--data.train.data_loader.num-workers $num_workers" fi diff --git a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh index 6251de97..35d2c0bc 100755 --- a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh +++ b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh @@ -10,19 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 -lid_ipe=1 +num_workers="" + . parse_options.sh || exit 1; . $config_file . 
datapath.sh list_dir=data/train_lid_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -33,22 +31,20 @@ lid_nnet_dir=exp/lid_nnets/lresnet34_lid_v1 # Network Training if [ $stage -le 1 ]; then - train_exec=torch-train-resnet-xvec-from-wav.py mkdir -p $lid_nnet_dir/log $cuda_cmd \ --gpu $ngpu $lid_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --cfg conf/lresnet34_lid_v1.yaml \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_train_lid/train.scp \ - --val-list $list_dir/lists_train_lid/val.scp \ - --class-file $list_dir/lists_train_lid/class2int \ - --iters-per-epoch $lid_ipe \ - --num-workers $num_workers \ - --num-gpus $ngpu \ - --exp-path $lid_nnet_dir $args - + train_xvector_from_wav.py resnet \ + --cfg conf/train_lresnet34_lid_v1.yaml \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_train_lid/train.scp \ + --data.train.dataset.class-file $list_dir/lists_train_lid/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_train_lid/val.scp \ + --trainer.exp-path $lid_nnet_dir $extra_args \ + --num-gpus $ngpu fi -exit diff --git a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh index 0941951f..73cb9a3d 100755 --- a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh @@ -195,7 +195,7 @@ if [ $stage -le 5 ]; then #SRE superset and 16 echo "SRE Superset Dev" steps_be/eval_be_plda_snorm_v2_cts.sh \ - --cmd "$train_cmd --mem 8G" \ + --cmd "$train_cmd --mem 12G" \ --plda_type $plda_type --ncoh $ncoh --num-parts 100 \ data/sre_cts_superset_16k_dev/trials \ data/sre_cts_superset_16k_dev/utt2enroll \ diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..bc311234 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + 
res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 30000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..031e9ca3 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..416926d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + 
in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..16203033 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..2d74799c --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 
+ se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/default_config.sh b/egs/sre21-av-a/v1.8k/default_config.sh index 91a20745..74b76b0a 120000 --- a/egs/sre21-av-a/v1.8k/default_config.sh +++ b/egs/sre21-av-a/v1.8k/default_config.sh @@ -1 +1 @@ -global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file +global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh \ No newline at end of file diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 69ad025b..65c2c924 100644 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_8k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net 
bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 64 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 30000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -82,8 +44,10 @@ ft_nnet=$ft_nnet_dir/model_ep0007.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..824361d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,48 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + 
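+# Note (illustrative, not part of the recipe): after this refactor the optimizer,
+# scheduler and architecture options live in the YAML file pointed to by
+# $nnet_base_cfg below; this shell config only keeps the bookkeeping variables
+# used to build experiment names and paths. A rough sketch of how
+# run_011_train_xvector.sh is expected to consume these variables
+# (arguments abbreviated):
+#
+#   . $config_file
+#   train_xvector_from_wav.py $nnet_type --cfg $nnet_base_cfg $nnet_args \
+#       --trainer.exp-path $nnet_dir --num-gpus $ngpu
+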
+nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index e1a923d7..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,68 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank64_stmn_8k.yaml -feat_type=fbank64_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 64 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" 
-ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..58010842 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,58 @@ +# Time SE Res2Net50 w26s4 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 +se_r=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0075.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_batch_size_1gpu=8 +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_ipe=1 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda + diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index 9f5c8e70..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_8k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=tseres2net50 -dropout=0 -embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 -se_r=256 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r 
--pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0075.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=15 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=21 -ft_nnet_num_epochs=45 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0014.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh index 9891e812..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh @@ -10,22 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . 
datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -35,6 +30,49 @@ fi # Network Training if [ $stage -le 1 ]; then + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + +# Network Training +if [ $stage -le 1 ]; then + if [[ ${nnet_type} =~ resnet1d ]]; then train_exec=torch-train-resnet1d-xvec-from-wav.py elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then diff --git a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 7a2a9be5..4e82a87a 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -41,7 +41,6 @@ if [ $stage -le 1 ]; then fi fi -#Train datasets if [ $stage -le 2 ];then if [ "$do_voxsrc22" == "true" ];then extra_data="voxsrc22_dev" diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index 831eb1bc..f956bc8c 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -20,17 +20,17 @@ if [ $stage -le 1 ];then prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ --cat-videos --use-kaldi-ids \ --output-dir data/voxceleb2cat_train - #local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train fi -exit + if [ $stage -le 2 ];then # prepare voxceleb1 for test - # This script is for the old version of the dataset - # local/make_voxceleb1_oeh.pl $voxceleb1_root data - # Use this for the newer version of voxceleb1: - local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data + #hyp_utils/conda_env.sh \ + prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test + #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi - +exit if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then local/prepare_voxsrc22_dev.py \ --vox1-corpus-dir $voxceleb1_root \ diff --git a/hyp_utils/create_data_link.pl b/hyp_utils/create_data_link.pl new file mode 100755 index 00000000..850f29f0 --- /dev/null +++ b/hyp_utils/create_data_link.pl @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +# Copyright 2013 Guoguo Chen +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0. +# +# This script distributes data onto different file systems by making symbolic +# links. It is supposed to use together with utils/create_split_dir.pl, which +# creates a "storage" directory that links to different file systems. +# +# If a sub-directory egs/storage does not exist, it does nothing. 
If it exists,
+# then it selects pseudo-randomly a number from those available in egs/storage/*
+# creates a link such as
+#
+#   egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+#
+use strict;
+use warnings;
+use File::Basename;
+use File::Spec;
+use Getopt::Long;
+
+sub GetGCD {
+  my ($a, $b) = @_;
+  while ($a != $b) {
+    if ($a > $b) {
+      $a = $a - $b;
+    } else {
+      $b = $b - $a;
+    }
+  }
+  return $a;
+}
+
+my $Usage = <<EOU;
+create_data_link.pl:
+This script distributes data onto different file systems by making symbolic
+links. It is supposed to be used together with utils/create_split_dir.pl,
+which creates a "storage" directory that links to different file systems.
+
+If a sub-directory egs/storage does not exist, it does nothing. If it exists,
+then it selects pseudo-randomly a number from those available in egs/storage/*
+and creates a link such as
+
+  egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+
+Usage: utils/create_data_link.pl <data-file-1> [<data-file-2> ... <data-file-N>]
+ e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark
+ (note: the dirname, e.g. foo/bar/, must be the same in all cases).
+
+See also utils/remove_data_links.sh
+EOU
+
+GetOptions();
+
+if (@ARGV == 0) {
+  die $Usage;
+}
+
+my $example_fullpath = $ARGV[0];
+
+# Check if the storage has been created. If so, do nothing.
+my $dirname = dirname($example_fullpath);
+if (! -d "$dirname/storage") {
+  exit(0);
+}
+
+# Storage exists, create symbolic links in the next few steps.
+
+# First, get a list of the available storage directories, and check if they are
+# properly created.
+opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n";
+my @storage_dirs = grep(/^[0-9]*$/, readdir($dh));
+closedir($dh);
+my $num_storage = scalar(@storage_dirs);
+for (my $x = 1; $x <= $num_storage; $x++) {
+  (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n";
+}
+
+# Second, get the coprime list.
+my @coprimes;
+for (my $n = 1; $n <= $num_storage; $n++) {
+  if (GetGCD($n, $num_storage) == 1) {
+    push(@coprimes, $n);
+  }
+}
+
+my $ret = 0;
+
+foreach my $fullpath (@ARGV) {
+  if ($dirname ne dirname($fullpath)) {
+    die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath";
+  }
+
+  # Finally, work out the directory index where we should put the data to.
+  my $basename = basename($fullpath);
+  my $filename_numbers = $basename;
+  $filename_numbers =~ s/[^0-9]+/ /g;
+  my @filename_numbers = split(" ", $filename_numbers);
+  my $total = 0;
+  my $index = 0;
+  foreach my $x (@filename_numbers) {
+    if ($index >= scalar(@coprimes)) {
+      $index = 0;
+    }
+    $total += $x * $coprimes[$index];
+    $index++;
+  }
+  my $dir_index = $total % $num_storage + 1;
+
+  # Make the symbolic link.
+  if (-e $fullpath) {
+    unlink($fullpath);
+  }
+  if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure
+    $ret = 1; # will exit with error status.
+  }
+}
+
+exit($ret);
+
+## testing:
+# rm -rf foo bar
+# mkdir -p bar/{1,2,3,4}
+# mkdir -p foo/storage
+# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done
+# utils/create_data_link.pl utils/create_data_link.pl foo/1.3.ark foo/2.3.ark
+# ls -l foo
+# total 0
+# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark
+# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark
+# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage
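Note (commentary, not part of the patch): create_data_link.pl above picks the destination
storage directory deterministically from the digit groups in the archive name, weighting
them by the numbers that are coprime with the number of storage directories. A rough
Python sketch of the same index computation, for readers who prefer it (function name and
inputs are illustrative only):

    import re
    from math import gcd

    def storage_index(basename: str, num_storage: int) -> int:
        # numbers in 1..num_storage that are coprime with num_storage
        coprimes = [n for n in range(1, num_storage + 1) if gcd(n, num_storage) == 1]
        # digit groups appearing in the file name, e.g. "egs.3.4.ark" -> [3, 4]
        digits = [int(d) for d in re.split(r"[^0-9]+", basename) if d]
        # weight each digit group by a coprime, cycling through the coprime list
        total = sum(d * coprimes[i % len(coprimes)] for i, d in enumerate(digits))
        return total % num_storage + 1  # 1-based index, i.e. storage/<index>/

    # e.g. storage_index("egs.3.4.ark", 4) == 4, matching the example link
    # egs/egs.3.4.ark -> storage/4/egs.3.4.ark in the script header.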
diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh
new file mode 100755
index 00000000..877b9e3f
--- /dev/null
+++ b/hyp_utils/create_data_split_dirs.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright
+#   2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <output-dir> <storage-dir> <nodes>"
+  echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0"
+  exit 1
+fi
+output_dir=$1
+storage_dir=$2
+nodes=$3
+
+link_dir=$output_dir/storage
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $link_dir ]; then
+  echo "Prepare to distribute data over multiple nodes ($nodes)"
+  dir_name=$storage_dir/$storage_name/storage
+  if [ "$nodes" == "b0" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/b{04,05,06,07}/$dir_name $link_dir
+  elif [ "$nodes" == "b1" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/b{14,15,16,17}/$dir_name $link_dir
+  elif [ "$nodes" == "c0" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/c{06,07,08,09}/$dir_name $link_dir
+  elif [ "$nodes" == "fs01" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/fs01/$dir_name $link_dir
+  else
+    echo "we don't distribute data between multiple machines"
+  fi
+fi
+
+
diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh
new file mode 100755
index 00000000..fb5b8ca0
--- /dev/null
+++ b/hyp_utils/create_data_split_links.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright
+#   2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+echo "$0 $@"  # Print the command line for logging
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <output-file-pattern> <num-jobs>"
+  echo "$0 exp/vad_dir/vad.JOB.ark 40"
+  exit 1
+fi
+output_file_pattern=$1
+nj=$2
+
+for n in $(seq $nj); do
+    # the next command does nothing unless output_dir/storage exists, see
+    # utils/create_data_link.pl for more info.
+    output_file=$(echo $output_file_pattern | sed 's@\.JOB\.[^\.]*$@.'$n'.@')
+    hyp_utils/create_data_link.pl $output_file
+done
+
diff --git a/hyp_utils/create_split_dir.pl b/hyp_utils/create_split_dir.pl
new file mode 100755
index 00000000..ab952357
--- /dev/null
+++ b/hyp_utils/create_split_dir.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/env perl
+
+# Copyright 2013  Guoguo Chen
+# Apache 2.0.
+#
+# This script creates storage directories on different file systems, and creates
+# symbolic links to those directories. For example, a command
+#
+# utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+#
+# will mkdir -p all of those directories, and will create links
+#
+# egs/storage/1 -> /export/gpu-03/egs/storage
+# egs/storage/2 -> /export/gpu-04/egs/storage
+# ...
+#
+use strict;
+use warnings;
+use File::Spec;
+use Getopt::Long;
+
+my $Usage = <<EOU;
+Usage: utils/create_split_dir.pl <actual-storage-dir-1> ... <actual-storage-dir-N> <pseudo-storage-dir>
+ e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+
+Allowed options:
+  --suffix <suffix>   : Common suffix to <actual-storage-dirs> (string, default = "")
+
+See also create_data_link.pl, which is intended to work with the resulting
+directory structure, and remove_data_links.sh
+EOU
+
+my $suffix="";
+GetOptions('suffix=s' => \$suffix);
+
+if (@ARGV < 2) {
+  die $Usage;
+}
+
+my $ans = 1;
+
+my $dir = pop(@ARGV);
+system("mkdir -p $dir 2>/dev/null");
+
+my @all_actual_storage = ();
+foreach my $file (@ARGV) {
+  push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix);
+}
+
+my $index = 1;
+foreach my $actual_storage (@all_actual_storage) {
+  my $pseudo_storage = "$dir/$index";
+
+  # If the symbolic link already exists, delete it.
+  if (-l $pseudo_storage) {
+    print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n";
+    $index++;
+    next;
+  }
+
+  # Create the destination directory and make the link.
+  system("mkdir -p $actual_storage 2>/dev/null");
+  if ($? 
!= 0) { + print STDERR "$0: error creating directory $actual_storage\n"; + exit(1); + } + { # create a README file for easier deletion. + open(R, ">$actual_storage/README.txt"); + my $storage_dir = File::Spec->rel2abs($dir); + print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; + print R "# The full list of directories where this data resides is:\n"; + foreach my $d (@all_actual_storage) { + print R "$d\n"; + } + close(R); + } + my $ret = symlink($actual_storage, $pseudo_storage); + + # Process the returned values + $ans = $ans && $ret; + if (! $ret) { + print STDERR "Error linking $actual_storage to $pseudo_storage\n"; + } + + $index++; +} + +exit($ans == 1 ? 0 : 1); diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 10ea491c..50fd5088 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -49,11 +49,11 @@ def __init__( self.sigma = sigma def forward(self, s_t): - # print('sigma0=', self.sigma) + if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) - # print('sigma1=', self.sigma) - f_t = self.feat_extractor(s_t) + + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -320,7 +320,7 @@ def eval_cosine_scoring( ) s.save_txt(score_file) - logging.info("saving stats to %s" % (stats_file)) + logging.info("saving stats to %s", stats_file) attack_stats.to_csv(stats_file) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index a6f535b3..5697404d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -84,7 +84,7 @@ def forward(self, s_t): s_t = self.wav_scale * s_t # End of pre-processing defense - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -289,13 +289,11 @@ def eval_cosine_scoring_wavegan( vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, ) t2 = time.time() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 5ba42477..0ca1f740 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -113,7 +113,7 @@ def forward(self, s_t): s_t = s_t[0, 0] f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index c3732bd3..49a762af 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -125,7 +125,7 @@ def eval_cosine_scoring( audio_reader = AR(test_wav_file, **audio_args) if vad_spec is not None: - logging.info("opening VAD 
stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") @@ -144,7 +144,7 @@ def eval_cosine_scoring( t2 = time.time() s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) - x_t = feat_extractor(s) + x_t, _ = feat_extractor(s) t4 = time.time() tot_frames = x_t.shape[1] if vad_spec is not None: diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index c00cf286..b2c111ca 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -45,7 +45,7 @@ def __init__( def forward(self, s_t): f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 4f2b82ab..8b6c8dae 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -59,8 +59,7 @@ def __init__( self.threshold = threshold def forward(self, s_t): - f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 2f5cf3da..98ba76b5 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -137,7 +137,7 @@ def eval_xvec( with AR(input_spec, **ar_args) as reader: if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): @@ -160,7 +160,7 @@ def eval_xvec( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -169,13 +169,11 @@ def eval_xvec( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) if random_utt_length: @@ -200,27 +198,23 @@ def eval_xvec( read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f", + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, ) if write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) + logging.info("writing num-frames to %s", write_num_frames_spec) u2nf = Utt2Info.create(keys, info) 
u2nf.save(write_num_frames_spec) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 1da1ac05..f49a5fb0 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -163,7 +163,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index a31bd614..9dc0aa2c 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -155,7 +155,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -164,13 +164,11 @@ def extract_xvectors( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) t6 = time.time() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 227892ea..7d602709 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,8 +10,12 @@ import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu @@ -239,72 +243,3 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) - - -# if __name__ == "__main__": - -# parser = ArgumentParser(description="Fine-tune x-vector model from audio files") -# parser.add_argument("--cfg", action=ActionConfigFile) - -# train_parser = ArgumentParser(prog="") -# AD.add_class_args(train_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(train_parser, prefix="sampler") -# train_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) - -# val_parser = ArgumentParser(prog="") -# AD.add_class_args(val_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(val_parser, prefix="sampler") -# val_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) -# data_parser = ArgumentParser(prog="") -# data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) -# data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) -# parser.add_argument("--data", action=ActionParser(parser=data_parser)) -# parser.link_arguments( -# "data.train.dataset.class_file", "data.val.dataset.class_file" -# ) -# parser.link_arguments( -# "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" -# ) -# parser.link_arguments( -# "data.train.sampler.batch_size", "data.val.sampler.batch_size" -# ) - -# AF.add_class_args(parser, prefix="feats") -# parser.add_argument("--in-model-path", required=True) - -# XVec.add_finetune_args(parser, prefix="model") -# Trainer.add_class_args( -# parser, 
prefix="trainer", train_modes=XVec.valid_train_modes() -# ) -# ddp.add_ddp_args(parser) - -# parser.add_argument("--seed", type=int, default=1123581321, help="random seed") -# parser.add_argument( -# "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int -# ) -# parser.add_argument("--local_rank", default=0, type=int) - -# args = parser.parse_args() -# gpu_id = args.local_rank -# del args.local_rank - -# if gpu_id == 0: -# try: -# config_file = Path(args.exp_path) / "config.yaml" -# parser.save(args, str(config_file), format="yaml", overwrite=True) -# except: -# pass - -# # torch docs recommend using forkserver -# multiprocessing.set_start_method("forkserver") -# train_xvec(gpu_id, args) diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index 7caae8c4..9ae59246 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -3,6 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# from .data_prep import data_prep_registry from .data_prep import DataPrep from .voxceleb2 import VoxCeleb2DataPrep +from .voxceleb1 import VoxCeleb1DataPrep diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py new file mode 100644 index 00000000..00b2e380 --- /dev/null +++ b/hyperion/data_prep/voxceleb1.py @@ -0,0 +1,338 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +import glob + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxCeleb1DataPrep(DataPrep): + """Class for preparing VoxCeleb1 database into tables, + It prepares the full voxceleb either to train or test with + Original/Entire/Hard. + We don't consider preparing dev for train and test for test Original + + Attributes: + corpus_dir: input data directory + task: train/test + cat_videos: concatenate utterances from the same video. + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + task: str, + cat_videos: bool, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = True + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + self.task = task + assert ( + cat_videos == False or task == "train" + ), "cat-videos is only available for train task" + + self.cat_videos = cat_videos + + @staticmethod + def dataset_name(): + return "voxceleb1" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--task", + default="test", + choices=["test", "train"], + help="""if we prepare the data for [test, train]""", + ) + parser.add_argument( + "--cat-videos", + default=False, + action=ActionYesNo, + help="""concatenate utterances from the same video.""", + ) + + def _get_metadata(self): + file_name = "vox1_meta.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.openslr.org/resources/49/vox1_meta.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_meta = pd.read_csv(file_path, sep="\t") + df_meta.rename(columns=str.strip, inplace=True) + df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x) + df_meta.set_index("VoxCeleb1 ID", inplace=True) + return df_meta + + def _get_langs_est(self): + file_name = "lang_vox2_final.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_lang = pd.read_csv(file_path, sep=",") + + if self.cat_videos: + + def get_video(x): + x = re.sub("/[^/]*.wav$", "", x) + return re.sub("/", "-", x) + + elif self.use_kaldi_ids: + + def get_video(x): + x = re.sub(".wav$", "", x) + return re.sub("/", "-", x) + + else: + + def get_video(x): + x = re.sub(".wav$", "", x) + x = re.sub("^[^/]*/", "", x) + return re.sub("/", "-", x) + + df_lang["id"] = df_lang["filename"].apply(get_video) + df_lang.drop(["filename"], axis=1, inplace=True) + df_lang.drop_duplicates(inplace=True) + df_lang.set_index("id", inplace=True) + df_lang["lang"] = df_lang["lang"].apply(str.lower) + return df_lang + + @staticmethod + def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): + list_file = lists_cat_dir / f"{rec_id}.txt" + with open(list_file, "w") as fw: + rec_idx = (video_idx == i).nonzero()[0] + recs_i = [f"file {rec_files[j]}" for j in rec_idx] + recs_i.sort() + recs_i = "\n".join(recs_i) + fw.write(f"{recs_i}\n") + + file_path = ( + f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" + ) + return file_path + + def make_trials(self): + url_base = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta" + trials_file_names = [ + "veri_test2.txt", + "list_test_hard2.txt", + "list_test_all2.txt", + ] + trials_names = ["trials_o", "trials_h", "trials_e"] + + trials = {} + dfs = [] + logging.info("making trials") + for trial_name, file_name in zip(trials_names, trials_file_names): + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = f"{url_base}/{file_name}" + file_path, _ = urlretrieve_progress(url, file_path, 
desc=file_name) + + df_in = pd.read_csv( + file_path, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + def get_modelid(s): + s = re.sub(r"\.wav", "", s) + return re.sub(r"/", "-", s) + + if self.use_kaldi_ids: + get_segmentid = get_modelid + else: + + def get_segmentid(s): + s = get_modelid(s) + return re.sub(r"[^-]*-", "", s) + + modelid = [get_modelid(f) for f in df_in["enroll_file"]] + segmentid = [get_segmentid(f) for f in df_in["test_file"]] + df_out = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / f"{trial_name}.csv" + df_out.to_csv(file_path, index=False) + dfs.append(df_out) + trials[trial_name] = file_path + + df_out = pd.concat(dfs, ignore_index=True) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_out.to_csv(file_path, index=False) + trials["trials"] = file_path + + logging.info("making enrollment map") + modelid = df_out["modelid"].sort_values().unique() + if self.use_kaldi_ids: + segmentid = modelid + else: + segmentid = [re.sub(r"[^-]*-", "", s) for s in modelid] + + df_out = pd.DataFrame({"modelid": modelid, "segmentid": segmentid}) + file_path = self.output_dir / "enrollment.csv" + df_out.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + return enrollments, trials + + def prepare(self): + + logging.info("getting audio meta-data") + df_meta = self._get_metadata() + logging.info("getting language estimations") + df_lang = self._get_langs_est() + rec_dir = self.corpus_dir + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? 
try glob
+            rec_files = [
+                Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True)
+            ]
+
+        speakers = [f.parents[1].name for f in rec_files]
+        video_ids = [f.parent.name for f in rec_files]
+        if self.cat_videos:
+            lists_cat_dir = self.output_dir / "lists_cat"
+            lists_cat_dir.mkdir(exist_ok=True, parents=True)
+            uniq_video_ids, uniq_video_idx, video_idx = np.unique(
+                video_ids, return_index=True, return_inverse=True
+            )
+            rec_ids = uniq_video_ids
+            speakers = [speakers[i] for i in uniq_video_idx]
+            rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)]
+
+            file_paths = []
+            futures = []
+            logging.info("making video cat lists")
+            logging.info("submitting threads...")
+            with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
+                for i, rec_id in tqdm(enumerate(rec_ids)):
+                    future = pool.submit(
+                        VoxCeleb1DataPrep.make_cat_list,
+                        lists_cat_dir,
+                        rec_id,
+                        rec_files,
+                        video_idx,
+                        i,
+                    )
+                    futures.append(future)
+
+            logging.info("waiting for threads...")
+            file_paths = [f.result() for f in tqdm(futures)]
+            video_ids = uniq_video_ids
+
+        else:
+            file_names = [f.with_suffix("").name for f in rec_files]
+            if self.use_kaldi_ids:
+                rec_ids = [
+                    f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names)
+                ]
+            else:
+                rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)]
+
+            file_paths = [str(r) for r in rec_files]
+
+        logging.info("making RecordingSet")
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+                "video_ids": video_ids,
+                "speaker": speakers,
+                "gender": df_meta.loc[speakers, "Gender"],
+                "nationality": df_meta.loc[speakers, "Nationality"],
+                "language_est": [
+                    df_lang.loc[r, "lang"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "language_est_conf": [
+                    df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "duration": recs.loc[rec_ids, "duration"].values,
+            }
+        )
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making speaker info file")
+        uniq_speakers = np.unique(speakers)
+        speakers = pd.DataFrame(
+            {
+                "id": uniq_speakers,
+                "vgg_id": df_meta.loc[uniq_speakers, "VGGFace1 ID"],
+                "gender": df_meta.loc[uniq_speakers, "Gender"],
+                "nationality": df_meta.loc[uniq_speakers, "Nationality"],
+            }
+        )
+        speakers = ClassInfo(speakers)
+
+        logging.info("making language info file")
+        languages = np.unique(df_lang["lang"])
+        languages = ClassInfo(pd.DataFrame({"id": languages}))
+
+        if self.task == "test":
+            enrollments, trials = self.make_trials()
+
+        logging.info("making dataset")
+        dataset = Dataset(
+            segments,
+            classes={"speaker": speakers, "languages": languages},
+            recordings={"recordings": recs},
+            enrollments=enrollments,
+            trials=trials,
+            sparse_trials=False,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments, %d speakers", len(segments), len(speakers)
+        )
diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py
index a1a9f0c3..1a32420f 100644
--- a/hyperion/data_prep/voxceleb2.py
+++ b/hyperion/data_prep/voxceleb2.py
@@ -3,6 +3,7 @@
   Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import logging
+import glob
 import re
 from concurrent.futures 
import ThreadPoolExecutor from pathlib import Path @@ -39,8 +40,7 @@ def __init__( target_sample_freq: int, num_threads: int = 10, ): - if cat_videos: - use_kaldi_ids = True + use_kaldi_ids = True super().__init__( corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads ) @@ -143,6 +143,12 @@ def prepare(self): rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) rec_files = list(rec_dir.glob("**/*.m4a")) + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: @@ -176,7 +182,7 @@ def prepare(self): video_ids = uniq_video_ids else: - file_names = [f.name for f in rec_files] + file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: rec_ids = [ f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names) diff --git a/hyperion/np/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py index d9d02ed0..60582016 100644 --- a/hyperion/np/classifiers/__init__.py +++ b/hyperion/np/classifiers/__init__.py @@ -10,4 +10,4 @@ from .linear_svmc import LinearSVMC from .logistic_regression import LogisticRegression from .q_scoring_homo_gbe import QScoringHomoGBE -from .svmc import GaussianSVMC +from .svmc import SVMC diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5e38494f..8fe67792 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -42,8 +42,9 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): multiplied by the input data. """ if weights is None: + time_dim = self.dim if self.dim >= 0 else x.dim() + self.dim return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=self.dim + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=time_dim ) if weights.dim() == x.dim(): @@ -599,7 +600,7 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): """standardizes the weights to have shape (batch, max_length).""" if weights is None: return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=2 ) if weights.dim() == x.dim(): @@ -797,7 +798,7 @@ def forward(self, x, x_lengths=None, weights=None): if attn.dtype == torch.half: min_value = -65504 else: - min_value = -1e200 + min_value = -1e20 mask = weights.eq(0) attn = attn.masked_fill(mask, min_value) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 8556104a..d67785d2 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -355,7 +355,7 @@ def forward_hid_feats( max_in_length = x.size(-1) x = self._pre_enc(x) h_enc, x = self.encoder_net.forward_hid_feats( - x, return_enc_layers, return_logits=True + x, return_enc_layers, return_output=True ) output = {"h_enc": h_enc} if not return_logits and return_classif_layers is None: @@ -363,7 +363,7 @@ def forward_hid_feats( x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) - h_classif, y_pred = self.classif_net.forward_hid_feats( + h_classif = self.classif_net.forward_hid_feats( p, y, return_classif_layers, return_logits=return_logits ) if return_logits: @@ -750,7 +750,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", 
default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 160ee61b..a9ad224e 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -32,7 +32,12 @@ def __init__( if mvn is not None: mvn = MVN.filter_args(**mvn) self.mvn_cfg = mvn - if mvn["norm_mean"] or mvn["norm_var"]: + if ( + ("norm_mean" in mvn) + and mvn["norm_mean"] + or ("norm_var" in mvn) + and mvn["norm_var"] + ): self.mvn = MVN(**mvn) self.spec_augment = None @@ -79,7 +84,7 @@ def forward(self, x, x_lengths=None): if self.trans: f = f.transpose(1, 2).contiguous() - return f + return f, f_lengths def get_config(self): config = { diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 9f9b280b..e5d90f4f 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -402,7 +402,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index f5ab74d5..172a3d70 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -389,7 +389,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 0c331a5e..6cf7f4ca 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -28,7 +28,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -362,7 +362,7 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 4106cbfd..68679e0b 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -410,7 +410,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index ce7b9677..bc7e4b33 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -29,7 +29,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, 
use_norm=True, @@ -367,7 +367,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/fcnet.py b/hyperion/torch/narchs/fcnet.py index cdbf1940..a47f304e 100644 --- a/hyperion/torch/narchs/fcnet.py +++ b/hyperion/torch/narchs/fcnet.py @@ -125,7 +125,7 @@ def __init__( in_units, hid_units, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, dropout_rate=0, norm_layer=None, diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 858cf4ea..5d3b9793 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -10,10 +10,16 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, - ResNetEndpointBlock, ResNetInputBlock, - SEResNetBasicBlock, SEResNetBNBlock) +from ..layer_blocks import ( + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetEndpointBlock, + ResNetInputBlock, + SEResNetBasicBlock, + SEResNetBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import scale_seq_lengths, seq_lengths_to_mask @@ -69,7 +75,7 @@ def __init__( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 0c577174..9332724f 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -9,9 +9,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, - ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, - SEResNet1dBNDecBlock) +from ..layer_blocks import ( + DC1dDecBlock, + ResNet1dBasicDecBlock, + ResNet1dBNDecBlock, + SEResNet1dBasicDecBlock, + SEResNet1dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF @@ -34,7 +38,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -450,7 +454,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 5bdad186..97b244f3 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -12,10 +12,16 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, - Res2Net1dBNBlock, ResNet1dBasicBlock, - ResNet1dBNBlock, ResNet1dEndpoint, - SEResNet1dBasicBlock, SEResNet1dBNBlock) +from ..layer_blocks import ( + DC1dEncBlock, + Res2Net1dBasicBlock, + Res2Net1dBNBlock, + ResNet1dBasicBlock, + ResNet1dBNBlock, + ResNet1dEndpoint, + SEResNet1dBasicBlock, + SEResNet1dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from ..utils import seq_lengths_to_mask 
@@ -37,7 +43,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, drop_connect_rate=0, @@ -472,7 +478,7 @@ def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False) if self.head_channels > 0: x = self.head_block(x) - return x + return h, x def get_config(self): @@ -675,7 +681,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 426b37f5..0afa1acc 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -10,9 +10,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, - ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, - SEResNet2dBNDecBlock) +from ..layer_blocks import ( + DC2dDecBlock, + ResNet2dBasicDecBlock, + ResNet2dBNDecBlock, + SEResNet2dBasicDecBlock, + SEResNet2dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF @@ -35,7 +39,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -457,7 +461,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 84e6599e..a7fd047e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -11,10 +11,15 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, - Res2Net2dBNBlock, ResNet2dBasicBlock, - ResNet2dBNBlock, SEResNet2dBasicBlock, - SEResNet2dBNBlock) +from ..layer_blocks import ( + DC2dEncBlock, + Res2Net2dBasicBlock, + Res2Net2dBNBlock, + ResNet2dBasicBlock, + ResNet2dBNBlock, + SEResNet2dBasicBlock, + SEResNet2dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import seq_lengths_to_mask @@ -38,7 +43,7 @@ class ResNet2dEncoder(NetArch): resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -65,7 +70,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -511,7 +516,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index 2d17a6d7..35ed9af0 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -146,7 +146,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -341,7 +341,7 @@ def add_class_args(parser, prefix=None): ) try: 
- parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 117c0733..4349dbe1 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -11,9 +11,17 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (BlockSpec, Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, ResNetInputBlock, - SpineConv, SpineEndpoints, SpineResample) +from ..layer_blocks import ( + BlockSpec, + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetInputBlock, + SpineConv, + SpineEndpoints, + SpineResample, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from .net_arch import NetArch @@ -111,7 +119,7 @@ def __init__( do_endpoint_conv=True, concat_ax=3, upsampling_type="nearest", - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 092cbd0e..871b37e9 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -44,7 +44,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -243,7 +243,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 901cc9d0..77f69b9c 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -21,7 +21,7 @@ def create( kernel_size=3, dilation=1, dilation_factor=1, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_units=0, out_act=None, dropout_rate=0, @@ -194,7 +194,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 4468185e..f8b50491 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -64,7 +64,7 @@ def __init__( in_layer_type="conv2d-sub", rel_pos_enc=False, causal_pos_enc=False, - hid_act="relu6", + hid_act="relu", norm_before=True, concat_after=False, padding_idx=-1, @@ -408,7 +408,7 @@ def add_class_args(parser, prefix=None, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 9541d7b0..52474baa 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -109,10 +109,10 @@ def train_epoch(self, data_loader): input_data, target = 
tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) with torch.no_grad(): - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): - output = self.model(feats, y=target) + output = self.model(feats, feats_lengths, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -162,9 +162,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): - output = self.model(feats) + output = self.model(feats, feats_lengths) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index fb93b439..934b4b90 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -17,9 +17,7 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): if max_in_length == max_out_length: return lengths - return torch.div(lengths * max_out_length, - max_in_length, - rounding_mode="floor") + return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): @@ -29,7 +27,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): lengths: sequence lengths with shape=(batch,). If None, it returns None max_length: maximum length of the sequence. dtype: dtype for the mask. - time_dim: dimension corresponding to time in the mask. This will + time_dim: dimension > 0 corresponding to time in the mask. This will return a view of the mask which will adapt to the shape of the tensor where we want to apply the mask. This has to be a positive integer. 
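
Editor's note: the trainer changes above now take frame-level feature lengths from the feature extractor, and the masking utilities convert sample-level lengths into frame-level ones. Below is a minimal sketch of that conversion using scale_seq_lengths; the waveform and frame counts are illustrative values, not taken from the patch.

    import torch
    from hyperion.torch.utils import scale_seq_lengths

    # Sample-level lengths of two waveforms in a padded batch (made-up values).
    wav_lengths = torch.tensor([16000, 12000])
    max_in_length = 16000   # samples in the longest waveform of the batch
    max_out_length = 49     # frames the feature extractor produces for that waveform

    # Rescale to frame-level lengths so padded frames can be masked out downstream.
    feat_lengths = scale_seq_lengths(wav_lengths, max_out_length, max_in_length)
    # feat_lengths -> tensor([49, 36])
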
@@ -40,6 +38,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): if lengths is None: return None + assert time_dim > 0 assert lengths.dim() == 1 if max_length is None: diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index db035987..51b476aa 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -5,6 +5,7 @@ from .class_info import ClassInfo from .dataset import Dataset +from .enrollment_map import EnrollmentMap from .feature_set import FeatureSet from .hyp_dataclass import HypDataClass from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -12,6 +13,7 @@ from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList + # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index efb7c114..e6c9e861 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -4,7 +4,7 @@ """ from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, Union import yaml @@ -13,41 +13,184 @@ from .misc import PathLike from .recording_set import RecordingSet from .segment_set import SegmentSet +from .enrollment_map import EnrollmentMap +from .trial_key import TrialKey +from .trial_ndx import TrialNdx +from .sparse_trial_key import SparseTrialKey class Dataset: """ Class that contains all objects (segments, recordings, features, class_infos) that conform a dataset + + Attributes: + segments: SegmentSet object or path to it. + classes: Dictionary of ClassInfo objects or paths to then + recordings: Dictionary of RecordingSet objects or paths to then + features: Dictionary of FeatureSet objects or paths to then + enrollments: Dictionary of EnrollmentMap objects or paths to then + trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects + or paths to then + sparse_trials: load trial keys using the SparseTrialKey class instead + of TrialKey class. 
+ table_sep: Column separator when reading/writting tables + """ def __init__( self, - segments: SegmentSet, - classes: Optional[Dict[str, ClassInfo]] = None, - recordings: Optional[Dict[str, RecordingSet]] = None, - features: Optional[Dict[str, FeatureSet]] = None, + segments: Union[SegmentSet, PathLike], + classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, + recordings: Optional[Dict[str, Union[RecordingSet, PathLike]]] = None, + features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, + enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, + trials: Optional[ + Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + ] = None, + sparse_trials: bool = False, + table_sep: Optional[str] = None, ): - self._segments = segments - self._classes = classes - self._recordings = recordings - self._features = features - @property - def segments(self): + if isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + assert isinstance(segments, (str, Path)) + self._segments = None + self._segments_path = Path(segments) + + self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) + + self._recordings, self._recordings_paths = self._parse_dict_args( + recordings, RecordingSet + ) + + self._features, self._features_paths = self._parse_dict_args( + features, FeatureSet + ) + self._enrollments, self._enrollments_paths = self._parse_dict_args( + enrollments, EnrollmentMap, + ) + self._trials, self._trials_paths = self._parse_dict_args( + trials, (TrialKey, TrialNdx, SparseTrialKey), + ) + + self.sparse_trials = sparse_trials + self.table_sep = table_sep + + def _parse_dict_args(self, data, types): + if data is None: + return None, None + + assert isinstance(data, dict) + objects = {k: (v if isinstance(v, types) else None) for k, v in data.items()} + paths = { + k: (v if isinstance(v, (str, Path)) else None) for k, v in data.items() + } + + return objects, paths + + def segments(self, keep_loaded: bool = True): + if self._segments is None: + assert self._segments_path is not None + segments = SegmentSet.load(self.segments_path, sep=self.table_sep) + if keep_loaded: + self._segments = segments + return segments + return self._segments - @property - def recordings(self): - return self._recordings + def recordings_value(self, key: str, keep_loaded: bool = True): + if self._recordings[key] is None: + assert self._recordings_paths[key] is not None + recordings = RecordingSet.load( + self._recordings_paths[key], sep=self.table_sep + ) + if keep_loaded: + self._recordings[key] = recordings + + return self._recordings[key] - @property - def features(self): - return self._features + def features_value(self, key: str, keep_loaded: bool = True): + if self._features[key] is None: + assert self._features_paths[key] is not None + features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) + if keep_loaded: + self._features[key] = features + + return self._features[key] + + def classes_value(self, key: str, keep_loaded: bool = True): + if self._classes[key] is None: + assert self._classes_paths[key] is not None + classes = ClassInfo.load(self._classes_paths[key], self.table_sep) + if keep_loaded: + self._classes[key] = classes + + return self._classes[key] + + def enrollments_value(self, key: str, keep_loaded: bool = True): + if self._enrollments[key] is None: + assert self._enrollments_paths[key] is not None + enrollments = EnrollmentMap.load( + self._enrollments_paths[key], sep=self.table_sep + 
) + if keep_loaded: + self._enrollments[key] = enrollments + + return self._enrollments[key] + + def trials_value(self, key: str, keep_loaded: bool = True): + if self._trials[key] is None: + assert self._trials_paths[key] is not None + try: + if self.sparse_trials: + trials = SparseTrialKey.load(self._trials_paths[key]) + else: + trials = TrialKey.load(self._trials_paths[key]) + except: + trials = TrialNdx.load(self._trials_paths[key]) + + if keep_loaded: + self._trials[key] = trials + + return self._trials[key] + + def recordings(self, keep_loaded: bool = True): + if self._recordings is None: + yield from () + else: + for key in self._recordings.keys(): + yield key, self.recordings_value(key, keep_loaded) + + def features(self, keep_loaded: bool = True): + if self._features is None: + yield from () + else: + for key in self._features.keys(): + yield key, self.features_value(key, keep_loaded) + + def classes(self, keep_loaded: bool = True): + if self._classes is None: + yield from () + else: + for key in self._classes.keys(): + yield key, self.classes_value(key, keep_loaded) + + def enrollments(self, keep_loaded: bool = True): + if self._enrollments is None: + yield from () + else: + for key in self._enrollments.keys(): + yield key, self.enrollments_value(key, keep_loaded) - @property - def classes(self): - return self._classes + def trials(self, keep_loaded: bool = True): + if self._trials is None: + yield from () + else: + for key in self._trials.keys(): + yield key, self.trials_value(key, keep_loaded) @staticmethod def resolve_dataset_path(dataset_path): @@ -69,64 +212,128 @@ def resolve_file_path(dataset_dir, file_path): return dataset_dir / file_path - def save(self, dataset_path: PathLike): + def save( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): """Saves all the dataset objects. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save - the dataset info. + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. 
+ update_paths: whether to update the file_paths in the + data structures in the DateSet object """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) dataset = {} - if self.segments is not None: - file_name = "segments.csv" - dataset["segments"] = file_name - file_path = dataset_dir / file_name - self.segments.save(file_path) + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments().save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path - if self.recordings is not None: - file_names = {} - for k, v in self.recordings.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.recordings(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + if file_names: dataset["recordings"] = file_names - if self.features is not None: - file_names = {} - for k, v in self.features.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.features(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + if file_names: dataset["features"] = file_names - if self.classes is not None: - file_names = {} - for k, v in self.classes.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.classes(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + if file_names: dataset["classes"] = file_names + file_names = {} + for k, v in self.enrollments(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in self.trials(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + with open(dataset_file, "w") as f: yaml.dump(dataset, f) + def update_from_disk(self): + self.segments() + for k, v in self.recordings(): + pass + + for k, v in self.features(): + pass + + for k, v in self.classes(): + pass + + for k, v in self.enrollments(): + pass + + for k, v in self.trials(): + pass + @classmethod - def load(cls, dataset_path: PathLike): + def load( + cls, dataset_path: PathLike, lazy: bool = True, sparse_trials: bool = False + ): """Loads all the dataset objects. Args: dataset_path: str/Path indicating directory to save the dataset or .yaml file to save the dataset info. + lazy: load data structures lazily when they are needed. 
+ sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) @@ -134,27 +341,79 @@ def load(cls, dataset_path: PathLike): dataset = yaml.safe_load(f) assert "segments" in dataset - segments = SegmentSet.load( - Dataset.resolve_file_path(dataset_dir, dataset["segments"]) - ) + segments = Dataset.resolve_file_path(dataset_dir, dataset["segments"]) classes = None recordings = None features = None + enrollments = None + trials = None if "classes" in dataset: classes = {} for k, v in dataset["classes"]: - classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: recordings = {} for k, v in dataset["recordings"]: - recordings[k] = RecordingSet.load( - Dataset.resolve_file_path(dataset_dir, v) - ) + recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} for k, v in dataset["features"]: - features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) + features[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "enrollments" in dataset: + enrollments = {} + for k, v in dataset["enrollments"]: + enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "trials" in dataset: + trials = {} + for k, v in dataset["trials"]: + trials[k] = Dataset.resolve_file_path(dataset_dir, v) + + dataset = cls( + segments, + classes, + recordings, + features, + enrollments, + trials, + sparse_trials=sparse_trials, + ) + if not lazy: + dataset.update_from_disk() + + return dataset + + # dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + # with open(dataset_file, "w") as f: + # dataset = yaml.safe_load(f) + + # assert "segments" in dataset + # segments = SegmentSet.load( + # Dataset.resolve_file_path(dataset_dir, dataset["segments"]) + # ) + # classes = None + # recordings = None + # features = None + # if "classes" in dataset: + # classes = {} + # for k, v in dataset["classes"]: + # classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + + # if "recordings" in dataset: + # recordings = {} + # for k, v in dataset["recordings"]: + # recordings[k] = RecordingSet.load( + # Dataset.resolve_file_path(dataset_dir, v) + # ) + + # if "features" in dataset: + # features = {} + # for k, v in dataset["features"]: + # features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) - return cls(segments, classes, recordings, features) + # dataset = cls(segments, classes, recordings, features) + # if not lazy: + # dataset.update_from_disk() diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py new file mode 100644 index 00000000..024e5b74 --- /dev/null +++ b/hyperion/utils/enrollment_map.py @@ -0,0 +1,86 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import re +from collections import OrderedDict +from copy import deepcopy +from pathlib import Path + +import numpy as np +import pandas as pd + +from .list_utils import split_list, split_list_group_by_key +from .info_table import InfoTable + + +class EnrollmentMap(InfoTable): + """Class to store the mapping between enrollment id + and segmentids + """ + + def __init__(self, df): + if "modelid" in df: + df.rename(columns={"modelid": "id"}, inplace=True) + super().__init__(df) + + def split(self, idx, num_parts): + """Splits the mapping into 
num_parts and return part idx. + + Args: + idx: Part to return from 1 to num_parts. + num_parts: Number of parts to split the list. + group_by: All the lines with the same value in column + groub_by_field go to the same part + + Returns: + Sub InfoTable object + """ + _, idx1 = split_list_group_by_key(self.df["id"], idx, num_parts) + + df = self.df.iloc[idx1] + return EnrollmentMap(df) + + def save(self, file_path, sep=None, nist_compatible=True): + if nist_compatible: + # For compatibility with NIST SRE files the index column "id" + # is saved as modelid + self.df.rename(columns={"id": "modelid"}, inplace=True) + + super().save(file_path, sep) + if nist_compatible: + self.df.rename(columns={"modelid": "id"}, inplace=True) + + @classmethod + def load(cls, file_path, sep=None): + """Loads EnrollmentMap from file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. + name: name for the data to be loaded + Returns: + EnrollmentMap object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext in ["", ".scp"]: + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["segmentid", "modelid"], + dtype={"segmentid": np.str, "modelid": np.str}, + ) + df = df[["modelid", "segmentid"]] + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + + return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 5a4f27d2..6bcd4aca 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -119,7 +119,7 @@ def from_dict(cls, df_dict): @classmethod def load(cls, file_path, sep=None, name="class_id"): - """Loads utt2info list from text file. + """Loads table from file. Args: file_path: File to read the list. @@ -127,7 +127,7 @@ def load(cls, file_path, sep=None, name="class_id"): dtype: Dictionary with the dtypes of each column. name: name for the data to be loaded Returns: - Utt2Info object + InfoTable object """ file_path = Path(file_path) ext = file_path.suffix @@ -156,7 +156,7 @@ def sort(self, column="id", ascending=True): self.df.sort_values(by=column, inplace=True, ascending=ascending) def split(self, idx, num_parts, group_by=None): - """Splits SCPList into num_parts and return part idx. + """Splits the table into num_parts and return part idx. Args: idx: Part to return from 1 to num_parts. @@ -177,13 +177,13 @@ def split(self, idx, num_parts, group_by=None): @classmethod def merge(cls, tables): - """Merges several Utt2Info tables. + """Merges several tables. Args: - info_lists: List of Utt2Info + info_lists: List of InfoTables Returns: - Utt2Info object concatenation the info_lists. + InfoTable object concatenation the info_lists. """ df_list = [table.df for table in tables] df = pd.concat(df_list) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index d51edc34..1852d25d 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -7,6 +7,10 @@ class SegmentSet(InfoTable): + """Class to store information about a speech segment + Internally, it uses a pandas table. 
+ """ + def __init__(self, df): super().__init__(df) if "start" in df and "recording_id" not in df: diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 5afc72a0..1bc321a7 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -5,8 +5,10 @@ import copy import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from .list_utils import * @@ -79,6 +81,28 @@ def save_txt(self, file_path): for r, c in zip(non.row, non.col): f.write("%s %s nontarget\n" % (self.model_set[r], self.seg_set[c])) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + self.tar.eliminate_zeros() + self.non.eliminate_zeros() + tar = self.tar.tocoo() + for r, c in zip(tar.row, tar.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}target\n") + non = self.non.tocoo() + for r, c in zip(non.row, non.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}nontarget\n") + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -113,6 +137,40 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + for item in zip(model_idx, seg_idx, is_tar): + if item[2]: + tar[item[0], item[1]] = True + else: + non[item[0], item[1]] = True + return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod def merge(cls, key_list): raise NotImplementedError() diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 9552d7c0..4a99461b 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -5,9 +5,11 @@ import copy import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from .list_utils import * from .trial_ndx import TrialNdx @@ -82,18 +84,20 @@ def sort(self): if self.trial_cond is not None: self.trial_cond = self.trial_cond[:, ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + ext = file_path.suffix + if ext in (".h5", ".hdf5"): self.save_h5(file_path) - else: + elif ext in ("", ".txt"): self.save_txt(file_path) + else: + self.save_table(file_path, sep) def save_h5(self, file_path): """Saves object to h5 file. 
@@ -132,20 +136,40 @@ def save_txt(self, file_path): file_path: File to write the list. """ with open(file_path, "w") as f: - idx = (self.tar.T == True).nonzero() + idx = (self.tar.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s target\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) - idx = (self.non.T == True).nonzero() + idx = (self.non.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s nontarget\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + I, J = np.logical_or(self.tar, self.non).nonzero() + for i, j in zip(I, J): + target_type = "target" if self.tar[i, j] else "nontarget" + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{target_type}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -154,11 +178,13 @@ def load(cls, file_path): Returns: TrialKey object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + _, file_ext = path.splitext(file_path) + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -240,6 +266,40 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar, non) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") + non = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True + else: + non[i, j] = True + return cls(model_set, seg_set, tar, non) + @classmethod def merge(cls, key_list): """Merges several key objects. 
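
Editor's note: the table-based trial format added above is shared by TrialKey and SparseTrialKey: a csv/tsv file with a modelid, segmentid, targettype header. Below is a minimal round-trip sketch under that assumption; the speaker, segment, and file names are hypothetical and only for illustration, not part of the patch.

    import numpy as np
    from hyperion.utils.trial_key import TrialKey

    # Toy key with one target and one nontarget trial (hypothetical ids).
    model_set = np.array(["spk1", "spk2"])
    seg_set = np.array(["utt1", "utt2", "utt3"])
    tar = np.zeros((2, 3), dtype=bool)
    non = np.zeros((2, 3), dtype=bool)
    tar[0, 0] = True   # spk1 vs utt1 -> target
    non[1, 2] = True   # spk2 vs utt3 -> nontarget

    key = TrialKey(model_set, seg_set, tar, non)
    key.save("trials.tsv")              # .tsv extension selects the tab-separated table format
    key2 = TrialKey.load("trials.tsv")  # routed to load_table() by the extension

Files with no extension or a .txt extension keep the old space-separated format, and .h5/.hdf5 files still go through the hdf5 path, as in the dispatch logic shown above.
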
From df8a24fc651240ab2fe193f900f624282e8fa9e0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 20 May 2023 22:05:30 +0000 Subject: [PATCH 29/89] add initialize model for joint-training and film training --- ...lize_model.py => initailize_film_model.py} | 1 + .../v1/local/initailize_lid_model.py | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+) rename egs/commonvoice/v1/local/{initailize_model.py => initailize_film_model.py} (99%) create mode 100644 egs/commonvoice/v1/local/initailize_lid_model.py diff --git a/egs/commonvoice/v1/local/initailize_model.py b/egs/commonvoice/v1/local/initailize_film_model.py similarity index 99% rename from egs/commonvoice/v1/local/initailize_model.py rename to egs/commonvoice/v1/local/initailize_film_model.py index 7ae9db8e..2b15c236 100644 --- a/egs/commonvoice/v1/local/initailize_model.py +++ b/egs/commonvoice/v1/local/initailize_film_model.py @@ -1,4 +1,5 @@ import torch +import sys # arguments example # pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py new file mode 100644 index 00000000..9a2c1a06 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -0,0 +1,49 @@ +import torch +import sys +# arguments example +# ASR_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# LID_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s1/model_ep0003.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s3/model_ep0001.pth + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def copy_model_parameters(ASR_model, LID_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + + update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} + # remove feature fuser + + new_LID_state_dict = LID_state_dict.copy() + new_LID_state_dict.update(update_state_dict) + + LID_model["model_state_dict"] = new_LID_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in LID_state_dict.items(): + if torch.all(torch.eq(param, new_LID_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in ASR_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + LID_model["epoch"] =1 + torch.save(LID_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model) \ No newline at end of file From 159ff073e14fb0e13101afc8c71de6bb4bffda2e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 21 May 2023 
05:46:18 +0000 Subject: [PATCH 30/89] Add transducer and languageid joint training --- ..._k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml | 84 +++++++ ...nfig_pruned_transducer_lid_v1.0_13langs.sh | 48 ++++ .../finetune_wav2vec2transducer_languageid.py | 123 ++++++++-- .../hf_wav2rnn_transducer_languageid.py | 213 ++++++++++-------- .../hf_wav2vec2rnn_transducer_languageid.py | 53 ++++- .../trainers/transducer_languageid_trainer.py | 3 +- 6 files changed, 395 insertions(+), 129 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml new file mode 100644 index 00000000..972f7c1c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml @@ -0,0 +1,84 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.0002 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.01 + data_loader: + num_workers: 8 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false + languageid: + # resnet_enc: + # num_classes: 13 + cos_scale: 32.0 + + loss_weight_transducer: 0.005 + loss_weight_lid: 1.0 + lid_length: 3.0 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh new file mode 100644 index 00000000..b4437442 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -0,0 +1,48 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer_resnet1d + +# nnet_s1_transducer_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml +# nnet_s1_transducer_args="" + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 +nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name +nnet_transducer=$nnet_transducer_dir/model_ep0008.pth + +nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 +nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name +nnet_lid=$nnet_lid_dir/model_ep0003.pth + +nnet_name=${hf_model_name}_rnnt_k2_pruned_resnet1d.v1.0_13_langs_8000_bpe + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/finetune_wav2vec2transducer_languageid.py b/hyperion/bin/finetune_wav2vec2transducer_languageid.py index 0628f3da..68d8dacf 100755 --- a/hyperion/bin/finetune_wav2vec2transducer_languageid.py +++ b/hyperion/bin/finetune_wav2vec2transducer_languageid.py @@ -37,13 +37,13 @@ def transducer_language_collate(batch): audio = [] audio_length = [] - text = [] + target = [] language = [] for record in batch: wav = 
torch.as_tensor(record["x"]) audio.append(wav) audio_length.append(wav.shape[0]) - text.append(record["text"]) + target.append(record["text"]) language.append(record["language"]) audio = pad_sequence(audio).transpose(0, 1) audio_length = torch.as_tensor(audio_length) @@ -52,21 +52,24 @@ def transducer_language_collate(batch): sort_idx = torch.argsort(audio_length, descending=True) audio = audio[sort_idx] audio_length = audio_length[sort_idx] - text = [text[k] for k in sort_idx] - text = k2.RaggedTensor(text) + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + language = [language[k] for k in sort_idx] language = torch.as_tensor(language) + # FiLM: add language ID to the input batch = { "x": audio, "x_lengths": audio_length, - "text": text, - "languageid": language, + "text": target, + "language": language, } return batch + def init_data(partition, rank, num_gpus, **kwargs): data_kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**data_kwargs["dataset"]) @@ -97,25 +100,95 @@ def init_data(partition, rank, num_gpus, **kwargs): return data_loader -def init_model(num_classes, in_model_transducer, in_model_lid, rank, model_class, **kwargs): +def check_update_parameters(joint_state_dict, new_joint_state_dict, rank): + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + # logging + if rank == 0: + logging.info("Unchanged parameters: {}".format(unchanged_parameters)) + logging.info("Changed parameters: {}".format(changed_parameters)) + + +def remove_module_from_state_dict(state_dict): + new_state_dict = {} + for name, param in state_dict.items(): + if name.startswith("module."): + new_state_dict[name[len("module."):]] = param + else: + new_state_dict[name] = param + return new_state_dict + + +def copy_model_parameters(joint_model, wav2transducer_state_dict, wav2lid_state_dict, rank): + joint_state_dict = joint_model.state_dict() + wav2transducer_state_dict = remove_module_from_state_dict(wav2transducer_state_dict) + wav2lid_state_dict = remove_module_from_state_dict(wav2lid_state_dict) + + + hf_feats_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} + languageid_update_state_dict = {name: param for name, param in wav2lid_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + + new_joint_state_dict["transducer_fuser"] = wav2transducer_state_dict["feat_fuser"] + new_joint_state_dict["languageid_fuser"] = wav2lid_state_dict["feat_fuser"] + + + check_update_parameters(joint_state_dict, new_joint_state_dict, rank) + joint_model.load_state_dict(new_joint_state_dict) + +def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): + # load pretrained models + 
model_wav2transducer = torch.load(in_model_transducer) + model_wav2lid = torch.load(in_model_lid) + if rank == 0: + logging.info("init joint model") + logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) + logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) + logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) + logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) + logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) + logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) + + # init joint model + model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], + transducer=model_wav2transducer["model_cfg"]["transducer"], + languageid=model_wav2lid["model_cfg"]["languageid"], + feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], + feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], + feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], + loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], + loss_weight_lid=kwargs["model"]["loss_weight_lid"], + lid_length=kwargs["model"]["lid_length"], + ) + + copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) + + + # add finetune args model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network ft args={}".format(model_args)) - model_wav2transducer = TML.load(in_model_transducer) - model_wav2lid = TML.load(in_model_lid) - model_args["languageid"]["num_classes"] = num_classes - logging.info(model_args) - model = model_class(model_wav2transducer.hf_feats, model_wav2transducer.transducer, model_wav2lid.languageid) + model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] model.change_config(**model_args) if rank == 0: logging.info("model={}".format(model)) return model - - - def train_model(gpu_id, args): config_logger(args.verbose) @@ -126,24 +199,24 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank + # ddp_args = ddp.filter_ddp_args(**kwargs) + # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + # kwargs["rank"] = rank - # # for Debug - # rank = 0 - # kwargs["rank"] = 0 - # device = "cpu" - # world_size=1 + # for Debug + rank = 0 + kwargs["rank"] = 0 + device = torch.device("cuda:0") + world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + model = init_model(**kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} + metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index b9f39de8..90211ec9 100644 --- 
a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -15,15 +15,18 @@ from ...torch_model import TorchModel from ...utils import remove_silence from ..transducer import RNNTransducer, RNNTransducerOutput +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID @dataclass class RNNTransducerLanguageIDOutput(HypDataClass): - loss: torch.Tensor - loss_transducer: torch.Tensor - loss_lid: torch.Tensor - loss_transducer_simple: Optional[torch.Tensor] = None - loss_transducer_pruned: Optional[torch.Tensor] = None - h_feats: Optional[List[torch.Tensor]] = None + loss: torch.Tensor # Total loss + loss_transducer: torch.Tensor # Loss from the transducer + loss_lid: torch.Tensor # Loss from the language ID + loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available + loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available + h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available + logits: Optional[torch.Tensor] = None # Logits from languageid, if available + class HFWav2RNNTransducerLanguageID(TorchModel): """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. @@ -40,61 +43,76 @@ class HFWav2RNNTransducerLanguageID(TorchModel): def __init__(self, hf_feats: TorchModel, - transducer: TorchModel, - languageid: TorchModel, - transducer_fuser: TorchModel, - languageid_fuser: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_languageid: str = "weighted-avg", loss_weight_transducer: float = 0.005, - loss_weight_lid: float = 1.0,): + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): super().__init__() self.hf_feats = hf_feats - # if isinstance(transducer, dict): - # transducer["decoder"]["in_feats"] = hf_feats.hidden_size - # #transducer["joiner"]["in_feats"] = hf_feats.hidden_size - # if "class_name" in transducer: - # del transducer["class_name"] - - # transducer["encoder"] = None - # transducer = RNNTransducer(**transducer) - # else: - # assert isinstance(transducer, RNNTransducer) - # if transducer.encoder is None: - # assert transducer.decoder.in_feats == hf_feats.hidden_size - # #assert transducer.joiner.in_feats == hf_feats.hidden_size + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + self.transducer = transducer self.languageid = languageid - self.transducer_fuser = transducer_fuser - self.languageid_fuser = 
languageid_fuser self.feat_fusion_start = feat_fusion_start - self.feat_fusion_method = feat_fusion_method + self.feat_fusion_method_transducer = feat_fusion_method_transducer + self.feat_fusion_method_languageid = feat_fusion_method_languageid self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length self._hf_context = contextlib.nullcontext() - - # def _make_fuser(self, transducer_fuser, languageid_fuser): - # if self.feat_fusion_method == "last": - # self.feat_fuser = None - # return - - # num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start - # layer_dim = self.hf_feats.hidden_size - # if self.feat_fusion_method == "weighted-avg": - # self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - # elif self.feat_fusion_method == "linear": - # self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - # self.feat_fuser.weight.data = torch.ones(1, - # num_layers) / num_layers - # elif self.feat_fusion_method == "cat": - # self.feat_fuser = nn.Linear(num_layers * layer_dim, - # layer_dim, - # bias=False) - - def _fuse_hid_feats(self, hid_feats): + self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_languageid) + + def _make_fuser(self, method): + if method == "last": + feat_fuser = None + return feat_fuser + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if method == "weighted-avg": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "linear": + feat_fuser = nn.Linear(num_layers, 1, bias=False) + feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif method == "cat": + feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + return feat_fuser + + + def _fuse_hid_feats(self, hid_feats, feat_fusion_method, feat_fuser): """Fuses the hidden features from the Wav2Vec model. 
Args: @@ -108,25 +126,20 @@ def _fuse_hid_feats(self, hid_feats): return hid_feats[0] hid_feats = hid_feats[self.feat_fusion_start:] - if self.feat_fusion_method == "weighted-avg": + if feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights_transducer = nn.functional.softmax(self.transducer_fuser, dim=-1) - norm_weights_languageid = nn.functional.softmax(self.languageid_fuser, dim=-1) - feats_transducer = torch.sum(hid_feats * norm_weights_transducer, dim=-1) - feats_languageid = torch.sum(hid_feats * norm_weights_languageid, dim=-1) - elif self.feat_fusion_method == "linear": + norm_weights = nn.functional.softmax(feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) - feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) - elif self.feat_fusion_method == "cat": + feats = feat_fuser(hid_feats).squeeze(dim=-1) + elif feat_fusion_method == "cat": hid_feats = torch.cat(hid_feats, dim=-1) - feats_transducer = self.transducer_fuser(hid_feats) - feats_languageid = self.languageid_fuser(hid_feats) - elif self.feat_fusion_method == "last": - feats_transducer = hid_feats[-1] - feats_languageid = hid_feats[-1] + feats = feat_fuser(hid_feats) + elif feat_fusion_method == "last": + feats = hid_feats[-1] - return feats_transducer, feats_languageid + return feats def forward_feats(self, x, @@ -135,7 +148,7 @@ def forward_feats(self, chunk_length=0, detach_chunks=False): return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + and self.feat_fusion_method_transducer == "last" else True) with self._hf_context: hf_output = self.hf_feats( x, @@ -147,7 +160,8 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) + feats_transducer = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_transducer, self.transducer_fuser) + feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) else: hid_feats = None feats_transducer = hf_output["last_hidden_state"] @@ -166,6 +180,25 @@ def forward_feats(self, hid_feats = None return feats_transducer, feats_languageid, hid_feats, feat_lengths + + def languageid_chunk(self, feats, lengths): + sr = self.hf_feats.get_config()["sample_frequency"] + strides = self.hf_feats.get_config()["conv_stride"] + + total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) + + chunk_length = int(self.lid_length * sr / total_stride) + + # Check if all samples are longer than chunk_length + if any(len < chunk_length for len in lengths): + return feats + + start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] + + chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) + + return chunks + def forward( self, @@ -199,6 +232,8 @@ def forward( feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) + feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) logits = self.languageid( @@ -221,7 +256,13 @@ def forward( if return_feat_layers: trans_output.h_feats = hid_feats 
- output = RNNTransducerLanguageIDOutput(self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + loss_transducer=trans_output.loss, + loss_lid=loss_lid, + loss_transducer_simple=trans_output.loss_simple, + loss_transducer_pruned=trans_output.loss_pruned, + h_feats=trans_output.h_feats, + logits=logits if return_logits else None) return output def infer(self, @@ -255,16 +296,16 @@ def infer(self, max_sym_per_utt=max_sym_per_utt) return y - def freeze_feat_fuser(self): - if self.feat_fuser is None: - return + # def freeze_feat_fuser(self): + # if self.feat_fuser is None: + # return - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser.requires_grad = False - return + # if self.feat_fusion_method_transducer == "weighted-avg": + # self.feat_fuser.requires_grad = False + # return - for param in self.feat_fuser.parameters(): - param.requires_grad = False + # for param in self.feat_fuser.parameters(): + # param.requires_grad = False def freeze_hf_feats(self): self.hf_feats.freeze() @@ -341,6 +382,7 @@ def filter_args(**kwargs): "feat_fusion_method", "loss_weight_transducer", "loss_weight_lid", + "languageid", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -357,9 +399,11 @@ def get_config(self): "transducer": tran_cfg, "languageid": lid_cfg, "feat_fusion_start": self.feat_fusion_start, - "feat_fusion_method": self.feat_fusion_method, + "feat_fusion_method_transducer": self.feat_fusion_method_transducer, + "feat_fusion_method_lid": self.feat_fusion_method_lid, "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, + "lid_length": self.lid_length, } base_config = super().get_config() @@ -395,25 +439,6 @@ def add_class_args(parser, prefix=None, skip=set()): "in [weighted-avg, linear, cat, last]"), ) - parser.add_argument( - "--loss-weight-transducer", - default=0.005, - type=float, - help=""" - The weight of the transducer loss - """, - ) - - parser.add_argument( - "--loss-weight-lid", - default=1.0, - type=float, - help=""" - The weight of the lid loss - """, - ) - - if prefix is not None: outer_parser.add_argument( diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index 4fa19144..10bdc53b 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -39,15 +39,19 @@ def __init__( transducer: Union[Dict, RNNTransducer], languageid: Union[Dict, ResNet1dLanguageID], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_languageid: str = "weighted-avg", + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, ): - # if isinstance(hf_feats, dict): - # if "class_name" in hf_feats: - # del hf_feats["class_name"] - # hf_feats = HFWav2Vec2(**hf_feats) - # else: - # assert isinstance(hf_feats, HFWav2Vec2) + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert 
isinstance(hf_feats, HFWav2Vec2) # if isinstance(languageid, dict): # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size @@ -64,7 +68,7 @@ def __init__( super().__init__(hf_feats, transducer, languageid, feat_fusion_start, - feat_fusion_method) + feat_fusion_method_transducer, feat_fusion_method_languageid, loss_weight_transducer, loss_weight_lid, lid_length) @staticmethod def filter_args(**kwargs): @@ -96,6 +100,12 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): base_args = {} + + valid_args = ( + "loss_weight_transducer", + "loss_weight_lid", + "lid_length", + ) child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) @@ -110,6 +120,33 @@ def add_finetune_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") RNNTransducer.add_finetune_args(parser, prefix="transducer") ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py index 2e9df702..8a06ebda 100644 --- a/hyperion/torch/trainers/transducer_languageid_trainer.py +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -135,9 +135,8 @@ def train_epoch(self, data_loader): for k, v in output.items(): if "loss" in k and v is not None: batch_metrics[k] = output[k].item() - for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output["logits"], languageid) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics From 29fdfb7e45bd089de6a5fbfaf50cf11efebcbd03 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 21 May 2023 05:55:00 +0000 Subject: [PATCH 31/89] add asr_lid run script --- egs/commonvoice/v1/run_020_train_asr_lid.sh | 140 ++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100755 egs/commonvoice/v1/run_020_train_asr_lid.sh diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh new file mode 100755 index 00000000..67ee65d4 --- /dev/null +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# # Network Training +# if [ $stage -le 1 ]; then + +# mkdir -p $nnet_s1_dir/log +# $cuda_cmd \ +# --gpu $ngpu $nnet_s1_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ +# train_wav2vec2rnn_transducer.py $nnet_type \ +# --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ +# --data.train.dataset.audio-file $train_dir/wav.scp \ +# --data.train.dataset.segments-file $train_dir/utt2seg.csv \ +# --data.train.dataset.class-names "language" \ +# --data.train.dataset.class-files $train_dir/langs \ +# --data.train.dataset.bpe-model $bpe_model \ +# --data.train.dataset.text-file $train_dir/text \ +# --data.val.dataset.audio-file $val_dir/wav.scp \ +# --data.val.dataset.segments-file $val_dir/utt2seg.csv \ +# --data.val.dataset.class-names "language" \ +# --data.val.dataset.class-files $train_dir/langs \ +# --data.val.dataset.text-file $val_dir/text \ +# --trainer.exp-path $nnet_s1_dir $args \ +# --data.train.dataset.time-durs-file $train_dir/utt2dur \ +# --data.val.dataset.time-durs-file $val_dir/utt2dur \ +# --num-gpus $ngpu + +# fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer_languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-transducer $nnet_transducer \ + --in-model-lid $nnet_lid \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + 
--data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + From 3d33522319c94b7b5ab3f7a1631a8e64260a7e57 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 21 May 2023 13:49:09 +0000 Subject: [PATCH 32/89] update film model --- ...2base_rnnt_film_k2_pruned_stage1_v2.0.yaml | 91 +++++++++++ ...g_pruned_filmed_transducer_v2.0_13langs.sh | 50 ++++++ hyperion/torch/layer_blocks/__init__.py | 2 +- hyperion/torch/layer_blocks/film_blocks.py | 69 +++++++- .../layer_blocks/transducer_film_predictor.py | 152 +++++++++++++++++- .../narchs/rnn_film_transducer_decoder.py | 7 +- 6 files changed, 358 insertions(+), 13 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml new file mode 100644 index 00000000..ba71c8ff --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh new file mode 100644 index 00000000..e056cf03 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s2_transducer_name=$nnet_transducer_name.s2 +nnet_s2_transducer_dir=exp/transducer_nnets/$nnet_s2_transducer_name +nnet_rnn_transducer=$nnet_s2_transducer_dir/model_ep0010.pth + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v2.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 62c096b2..61d97285 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -9,7 +9,7 @@ from 
.dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock -from .film_blocks import FiLM, LSTMWithFiLM +from .film_blocks import FiLM, RNNWithFiLM, RNNWithFiLMResidual from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 5caeab76..9503fcfe 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -10,6 +10,7 @@ def __init__(self, input_size, condition_size): self.linear_shift = nn.Linear(condition_size, input_size) def forward(self, x, lang_condition): + # import pdb; pdb.set_trace() if x.ndim == 3: gamma = self.linear_scale(lang_condition).unsqueeze(1).expand_as(x) beta = self.linear_shift(lang_condition).unsqueeze(1).expand_as(x) @@ -22,28 +23,80 @@ def forward(self, x, lang_condition): -class LSTMWithFiLM(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True): - super(LSTMWithFiLM, self).__init__() +class RNNWithFiLM(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm"): + super(RNNWithFiLM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers self.dropout = dropout - self.batch_first = batch_first + self.batch_first = batch_first + self.rnn_type = rnn_type + if self.rnn_type == "lstm": + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + elif self.rnn_type == "gru": + self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) + + def forward(self, x, states, lang_condition): + outputs = [] + new_h, new_c = [], [] + if self.rnn_type == "lstm": + rnns = self.lstms + elif self.rnn_type == "gru": + rnns = self.grus + + for i, (rnn, film) in enumerate(zip(rnns, self.films)): + if states: + x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + else: + x, (h_i, c_i) = rnn(x) + x = film(x, lang_condition) + new_h.append(h_i) + new_c.append(c_i) + if i != self.num_layers - 1: + x = self.dropout_layer(x) + outputs.append(x) + new_h = torch.cat(new_h, dim=0) + new_c = torch.cat(new_c, dim=0) + return x, (new_h, new_c) - self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + +class RNNWithFiLMResidual(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual"): + super(RNNWithFiLMResidual, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.batch_first = batch_first + self.rnn_type = rnn_type + if self.rnn_type == "lstm_residual": + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + elif self.rnn_type == "gru_residual": + self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else 
hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) def forward(self, x, states, lang_condition): outputs = [] new_h, new_c = [], [] - for i, (lstm, film) in enumerate(zip(self.lstms, self.films)): + + if self.rnn_type == "lstm_residual": + rnns = self.lstms + elif self.rnn_type == "gru_residual": + rnns = self.grus + + for i, (rnn, film) in enumerate(zip(rnns, self.films)): if states: - x, (h_i, c_i) = lstm(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) else: - x, (h_i, c_i) = lstm(x) + x, (h_i, c_i) = rnn(x) x = film(x, lang_condition) + if i != 0: + x = x + residual + residual = x new_h.append(h_i) new_c.append(c_i) if i != self.num_layers - 1: diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py index dbb93218..cb628a2c 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -12,7 +12,7 @@ from ...utils.misc import filter_func_args from ..layers import ActivationFactory as AF -from .film_blocks import FiLM, LSTMWithFiLM +from .film_blocks import FiLM, RNNWithFiLM, RNNWithFiLMResidual class TransducerRNNFiLMPredictor(nn.Module): """ RNN-T prediction network with LSTM or GRU @@ -46,14 +46,25 @@ def __init__(self, padding_idx=blank_id, ) self.embed_dropout = nn.Dropout(embed_dropout_rate) - if rnn_type == "lstm": - self.rnn = LSTMWithFiLM( + if rnn_type in ["lstm","gru"]: + self.rnn = RNNWithFiLM( input_size=embed_dim, hidden_size=hid_feats, num_layers=num_layers, dropout=rnn_dropout_rate, condition_size=condition_size, batch_first=True, + rnn_type=rnn_type + ) + elif rnn_type in ["lstm_residual","gru_residual"]: + self.rnn = RNNWithFiLMResidual( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + dropout=rnn_dropout_rate, + condition_size=condition_size, + batch_first=True, + rnn_type=rnn_type ) else: raise Exception(f"Unknown RNN type {rnn_type}") @@ -126,3 +137,138 @@ def change_config( self.rnn.p = self.rnn_dropout_rate self.embed_dropout_rate = embed_dropout_rate self.embed_dropout = nn.Dropout(self.embed_dropout_rate) + +class TransducerConvPredictor(nn.Module): + """ RNN-T prediction network based on Convolutions + Implmentation based on: + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/decoder.py + + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + blank_id: The ID of the blank symbol. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. 
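The FiLM blocks in film_blocks.py above condition each recurrent layer on a language embedding by predicting a per-feature scale and shift from the condition vector. A minimal sketch of the operation, with assumed shapes (illustrative, not the patch's code):

import torch
import torch.nn as nn

# Feature-wise linear modulation: y = gamma(cond) * x + beta(cond)
class FiLMSketch(nn.Module):
    def __init__(self, feat_dim, cond_dim):
        super().__init__()
        self.scale = nn.Linear(cond_dim, feat_dim)
        self.shift = nn.Linear(cond_dim, feat_dim)

    def forward(self, x, cond):
        # x: (batch, time, feat_dim); cond: (batch, cond_dim), e.g. a language embedding
        gamma = self.scale(cond).unsqueeze(1)   # (batch, 1, feat_dim), broadcast over time
        beta = self.shift(cond).unsqueeze(1)
        return gamma * x + beta

x = torch.randn(2, 10, 512)
cond = torch.randn(2, 256)
y = FiLMSketch(512, 256)(x, cond)               # same shape as x

In RNNWithFiLM and RNNWithFiLMResidual above, one such block follows every LSTM/GRU layer, so the language condition can modulate each layer's activations; the residual variant additionally adds the previous layer's output back in.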
+ """ + + def __init__( + self, + vocab_size: int, + embed_dim: int, + condition_size: int, + out_feats: Optional[int] = None, + context_size: int = 2, + embed_dropout_rate: float = 0.0, + hid_act: str = "relu", + blank_id: int = 0, + ): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + assert context_size >= 1, context_size + if context_size > 1: + self.conv = nn.Conv1d( + in_channels=embed_dim, + out_channels=embed_dim, + kernel_size=context_size, + padding=0, + groups=out_feats // 4, + bias=False, + ) + + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_dropout_rate = embed_dropout_rate + self.context_size = context_size + self.hid_act = AF.create(hid_act) + + if out_feats is None: + out_feats = embed_dim + + self.out_feats = out_feats + if out_feats != embed_feats: + self.output_proj = nn.Linear(embed_dim, out_feats) + else: + self.output_proj = None + + def get_config(self): + hid_act = AF.get_config(self.hid_act) + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "out_feats": self.out_feats, + "context_size": self.context_size, + "embed_dropout_rate": self.embed_dropout_rate, + "blank_id": self.blank_id, + "hid_act": hid_act, + } + return config + + def forward( + self, + y: torch.Tensor, + states: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, None]: + """ + Args: + y: + A 2-D tensor of shape (N, U). + # need_pad: + # True to left pad the input. Should be True during training. + # False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). 
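TransducerConvPredictor above follows the icefall-style stateless predictor cited in its docstring: the label history is embedded and passed through a causal, grouped 1-d convolution, so only the last context_size labels influence the prediction. A minimal sketch of that idea under assumed dimensions (illustrative, not the patch's code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvPredictorSketch(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, context_size=2, blank_id=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=blank_id)
        self.context_size = context_size
        # grouped conv over the last `context_size` embedded labels
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size,
                              groups=embed_dim // 4, bias=False)

    def forward(self, y):
        # y: (batch, U) int64 label ids
        embed = self.embedding(y).transpose(1, 2)              # (batch, embed_dim, U)
        embed = F.pad(embed, pad=(self.context_size - 1, 0))   # left pad -> causal
        out = self.conv(embed).transpose(1, 2)                 # (batch, U, embed_dim)
        return torch.relu(out)

pred = ConvPredictorSketch(vocab_size=500)
out = pred(torch.randint(0, 500, (2, 7)))                      # (2, 7, 256)

During training the whole left-padded label sequence is convolved at once; at inference only the last context_size labels are needed, which is what the need_pad branch in the commented-out icefall reference at the bottom of the file handles.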
+ """ + y = y.to(torch.int64) + embed = self.embedding(y) + if self.context > 1: + embed = embed.transpose(1, 2) + if states is None: + embed = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + else: + raise NotImplementedError() + embed = self.conv(embed).transpose(1, 2) + + out = self.hid_act(embed) + if self.output_proj: + out = self.output_proj(out) + + return out, None + + # # this stuff about clamp() is a temporary fix for a mismatch + # # at utterance start, we use negative ids in beam_search.py + # if torch.jit.is_tracing(): + # # This is for exporting to PNNX via ONNX + # embedding_out = self.embedding(y) + # else: + # embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) + # if self.context_size > 1: + # embedding_out = embedding_out.permute(0, 2, 1) + # if need_pad is True: + # embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + # else: + # # During inference time, there is no need to do extra padding + # # as we only need one output + # assert embedding_out.size(-1) == self.context_size + # embedding_out = self.conv(embedding_out) + # embedding_out = embedding_out.permute(0, 2, 1) + # embedding_out = F.relu(embedding_out) + # return embedding_out + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + ): + logging.info("changing predictor config") + + if override_dropouts: + logging.info("overriding predictor dropouts") + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 91a30caf..2797d5a3 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -750,7 +750,6 @@ def add_class_args(parser, type=int, required=True, help=("output prediction dimension")) - RNNFiLMTransducerDecoder.add_pred_args(parser) RNNFiLMTransducerDecoder.add_joiner_args(parser) parser.add_argument( @@ -782,6 +781,12 @@ def add_class_args(parser, type=Optional[int], help="""how many symbols to keep for each frame in k2 rnn-t pruned loss.""") + + parser.add_argument("--condition-size", + type=int, + required=True, + help=("condition vector dimension")) + parser.add_argument( "--lm-scale", default=0.25, From a5971ab3655f073ae48ef90272ebfe7cdf9f8b24 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 18:21:48 -0400 Subject: [PATCH 33/89] update transducer_languageid joint model --- .../hf_wav2rnn_transducer_languageid.py | 62 +++++++++++-------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index 90211ec9..b4f3b7dd 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -112,7 +112,7 @@ def _make_fuser(self, method): return feat_fuser - def _fuse_hid_feats(self, hid_feats, feat_fusion_method, feat_fuser): + def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. 
Args: @@ -126,20 +126,24 @@ def _fuse_hid_feats(self, hid_feats, feat_fusion_method, feat_fuser): return hid_feats[0] hid_feats = hid_feats[self.feat_fusion_start:] - if feat_fusion_method == "weighted-avg": + if self.feat_fusion_method_transducer == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights = nn.functional.softmax(feat_fuser, dim=-1) - feats = torch.sum(hid_feats * norm_weights, dim=-1) - elif feat_fusion_method == "linear": + norm_transducer_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + norm_lid_weights = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats_transducer = torch.sum(hid_feats * norm_transducer_weights, dim=-1) + feats_languageid = torch.sum(hid_feats * norm_lid_weights, dim=-1) + elif self.feat_fusion_method_transducer == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats = feat_fuser(hid_feats).squeeze(dim=-1) - elif feat_fusion_method == "cat": + feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) + feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_transducer == "cat": hid_feats = torch.cat(hid_feats, dim=-1) - feats = feat_fuser(hid_feats) - elif feat_fusion_method == "last": + feats_transducer = self.transducer_fuser(hid_feats) + feats_languageid = self.languageid_fuser(hid_feats) + elif self.feat_fusion_method_transducer == "last": feats = hid_feats[-1] - return feats + return feats_transducer, feats_languageid def forward_feats(self, x, @@ -160,8 +164,8 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats_transducer = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_transducer, self.transducer_fuser) - feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) + feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) + # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) else: hid_feats = None feats_transducer = hf_output["last_hidden_state"] @@ -181,23 +185,23 @@ def forward_feats(self, return feats_transducer, feats_languageid, hid_feats, feat_lengths - def languageid_chunk(self, feats, lengths): - sr = self.hf_feats.get_config()["sample_frequency"] - strides = self.hf_feats.get_config()["conv_stride"] + # def languageid_chunk(self, feats, lengths): + # sr = self.hf_feats.get_config()["sample_frequency"] + # strides = self.hf_feats.get_config()["conv_stride"] - total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) + # total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) - chunk_length = int(self.lid_length * sr / total_stride) + # chunk_length = int(self.lid_length * sr / total_stride) - # Check if all samples are longer than chunk_length - if any(len < chunk_length for len in lengths): - return feats + # # Check if all samples are longer than chunk_length + # if any(len < chunk_length for len in lengths): + # return feats - start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] + # start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] - chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) + # chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) - return chunks + # return chunks def forward( @@ -231,8 +235,14 
@@ def forward( """ feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) + + lid_len = int(self.lid_length * 50) + lid_start = torch.randint(0, torch.min(feat_lengths).item() - lid_len + 1, (1,)).item() + + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + - feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) + # feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) @@ -261,8 +271,8 @@ def forward( loss_lid=loss_lid, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, - h_feats=trans_output.h_feats, - logits=logits if return_logits else None) + h_feats=trans_output.h_feats) + #logits=[logit.item() for logit in logits] if return_logits else None) return output def infer(self, From 92f33d3c5d9cbfe02afa67c589236f78f622e420 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 01:20:09 +0000 Subject: [PATCH 34/89] update ASR and LID joint training code --- ...ransducer_ecapadnn1024x3_stage1_v1.0.yaml} | 60 +++- ...nfig_pruned_transducer_lid_v1.0_13langs.sh | 26 +- .../v1/local/initailize_joint_model.py | 56 ++++ egs/commonvoice/v1/run_020_train_asr_lid.sh | 53 ++-- .../identificate_wav2vec2resnet1d.sh | 2 +- ...train_wav2vec2rnn_transducer_languageid.py | 270 ++++++++++++++++++ .../hf_wav2rnn_transducer_languageid.py | 59 +++- .../hf_wav2vec2rnn_transducer_languageid.py | 8 +- .../trainers/transducer_languageid_trainer.py | 2 +- 9 files changed, 477 insertions(+), 59 deletions(-) rename egs/commonvoice/v1/conf/{train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml => train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml} (59%) create mode 100644 egs/commonvoice/v1/local/initailize_joint_model.py create mode 100755 hyperion/bin/train_wav2vec2rnn_transducer_languageid.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml similarity index 59% rename from egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml rename to egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml index 972f7c1c..dfc64d75 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.0002 + num_chunks_per_seg_epoch: 0.1 data_loader: num_workers: 8 @@ -43,22 +43,72 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.01 + num_chunks_per_seg_epoch: 1.0 data_loader: num_workers: 8 model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m transducer: decoder: prune_range: 15 - override_dropouts: false + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: - # resnet_enc: - # num_classes: 13 + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + 
resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish loss_weight_transducer: 0.005 loss_weight_lid: 1.0 lid_length: 3.0 + # feat_fusion_method: weighted-avg + feat_fusion_start: 2 trainer: optim: diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh index b4437442..aaafecc1 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -12,10 +12,10 @@ dev_data=13_langs_dev_proc_audio test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" -language=13_langs +language=13_langs_weighted # bpe_model=data/13_langs_lang_bpe_4000/bpe.model -bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model # bpe_model=data/13_langs_lang_bpe_16000/bpe.model # x-vector cfg @@ -24,17 +24,23 @@ nnet_type=hf_wav2vec2rnn_transducer_resnet1d # nnet_s1_transducer_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml # nnet_s1_transducer_args="" -nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 -nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name -nnet_transducer=$nnet_transducer_dir/model_ep0008.pth +# nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 +# nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name +# nnet_transducer=$nnet_transducer_dir/model_ep0008.pth -nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 -nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name -nnet_lid=$nnet_lid_dir/model_ep0003.pth +# nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 +# nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name +# nnet_lid=$nnet_lid_dir/model_ep0003.pth -nnet_name=${hf_model_name}_rnnt_k2_pruned_resnet1d.v1.0_13_langs_8000_bpe +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +nnet_s1_args="" -nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml +nnet_name=${hf_model_name}_rnnt_k2_pruned_transducer_ecapadnn1024x3.v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name diff --git a/egs/commonvoice/v1/local/initailize_joint_model.py b/egs/commonvoice/v1/local/initailize_joint_model.py new file mode 100644 index 00000000..fd98d3f2 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_joint_model.py @@ -0,0 +1,56 @@ +import torch +import sys +# 
arguments example +# + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) +joint_model = torch.load(sys.argv[3]) + +output_model = sys.argv[4] + + +def check_update_parameters(joint_state_dict, new_joint_state_dict): + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + print("Unchanged parameters: {}".format(unchanged_parameters)) + print("Changed parameters: {}".format(changed_parameters)) + + + +def copy_model_parameters(ASR_model, LID_model, joint_model, output_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + joint_state_dict = joint_model["model_state_dict"] + + hf_feats_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} + languageid_update_state_dict = {name: param for name, param in LID_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + # import pdb;pdb.set_trace() + + new_joint_state_dict["module.transducer_fuser"] = ASR_state_dict["module.feat_fuser"] + new_joint_state_dict["module.languageid_fuser"] = LID_state_dict["module.feat_fuser"] + + + joint_model["model_state_dict"] = new_joint_state_dict + joint_model["epoch"] =1 + + check_update_parameters(joint_state_dict, new_joint_state_dict) + torch.save(joint_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model, joint_model, output_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh index 67ee65d4..4b312e76 100755 --- a/egs/commonvoice/v1/run_020_train_asr_lid.sh +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -52,32 +52,33 @@ if [ "$use_wandb" == "true" ];then fi -# # Network Training -# if [ $stage -le 1 ]; then - -# mkdir -p $nnet_s1_dir/log -# $cuda_cmd \ -# --gpu $ngpu $nnet_s1_dir/log/train.log \ -# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ -# train_wav2vec2rnn_transducer.py $nnet_type \ -# --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ -# --data.train.dataset.audio-file $train_dir/wav.scp \ -# --data.train.dataset.segments-file $train_dir/utt2seg.csv \ -# --data.train.dataset.class-names "language" \ -# --data.train.dataset.class-files $train_dir/langs \ -# --data.train.dataset.bpe-model $bpe_model \ -# --data.train.dataset.text-file $train_dir/text \ -# --data.val.dataset.audio-file $val_dir/wav.scp \ -# --data.val.dataset.segments-file $val_dir/utt2seg.csv \ -# --data.val.dataset.class-names "language" \ -# --data.val.dataset.class-files $train_dir/langs \ -# --data.val.dataset.text-file $val_dir/text \ -# --trainer.exp-path $nnet_s1_dir $args \ -# --data.train.dataset.time-durs-file $train_dir/utt2dur \ -# 
--data.val.dataset.time-durs-file $val_dir/utt2dur \ -# --num-gpus $ngpu - -# fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_transducer_languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1238 \ + --num-gpus $ngpu + +fi if [ $stage -le 2 ]; then diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh index 8b31ac2f..5a2bbc27 100755 --- a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -78,7 +78,7 @@ if [ $stage -le 1 ];then cat $output_dir/languageid.* > $output_dir/langs - # python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_lid/lid_score.py $output_dir/langs >> $output_dir/scores # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text diff --git a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py new file mode 100755 index 00000000..85689ac3 --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in 
batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_language_collate) + return data_loader + +def init_model(blank_id, vocab_size, num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model_args["languageid"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + list(train_loader.dataset.num_classes.values())[0], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + 
ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index b4f3b7dd..8c7d54d7 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -47,7 +47,7 @@ def __init__(self, languageid: Union[Dict, TorchModel], feat_fusion_start: int = 0, feat_fusion_method_transducer: str = "weighted-avg", - feat_fusion_method_languageid: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", loss_weight_transducer: float = 0.005, 
loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -84,13 +84,13 @@ def __init__(self, self.feat_fusion_start = feat_fusion_start self.feat_fusion_method_transducer = feat_fusion_method_transducer - self.feat_fusion_method_languageid = feat_fusion_method_languageid + self.feat_fusion_method_lid = feat_fusion_method_lid self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self.lid_length = lid_length self._hf_context = contextlib.nullcontext() self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer) - self.languageid_fuser = self._make_fuser(self.feat_fusion_method_languageid) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid) def _make_fuser(self, method): if method == "last": @@ -165,7 +165,7 @@ def forward_feats(self, if return_hid_states: hid_feats = hf_output["hidden_states"] feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) - # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) + # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_lid, self.languageid_fuser) else: hid_feats = None feats_transducer = hf_output["last_hidden_state"] @@ -235,11 +235,12 @@ def forward( """ feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) - - lid_len = int(self.lid_length * 50) - lid_start = torch.randint(0, torch.min(feat_lengths).item() - lid_len + 1, (1,)).item() - feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + lid_len = int(self.lid_length * 50) + min_len = torch.min(feat_lengths).item() + if min_len > lid_len: + lid_start = torch.randint(0, min_len - lid_len + 1, (1,)).item() + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] # feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) @@ -271,8 +272,8 @@ def forward( loss_lid=loss_lid, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, - h_feats=trans_output.h_feats) - #logits=[logit.item() for logit in logits] if return_logits else None) + h_feats=trans_output.h_feats, + logits=logits if return_logits else None) return output def infer(self, @@ -389,7 +390,8 @@ def filter_args(**kwargs): "hf_feats", "transducer", "feat_fusion_start", - "feat_fusion_method", + "feat_fusion_method_transducer", + "feat_fusion_method_lid", "loss_weight_transducer", "loss_weight_lid", "languageid", @@ -442,13 +444,46 @@ def add_class_args(parser, prefix=None, skip=set()): the wav2vec num_layers""", ) parser.add_argument( - "--feat-fusion-method", + "--feat-fusion-method-transducer", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + parser.add_argument( + "--feat-fusion-method-lid", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], help=("method to fuse the hidden layers from the wav2vec model " "in [weighted-avg, linear, cat, last]"), ) + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) if 
prefix is not None: outer_parser.add_argument( diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index 10bdc53b..c8cd974b 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -40,7 +40,7 @@ def __init__( languageid: Union[Dict, ResNet1dLanguageID], feat_fusion_start: int = 0, feat_fusion_method_transducer: str = "weighted-avg", - feat_fusion_method_languageid: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -68,7 +68,7 @@ def __init__( super().__init__(hf_feats, transducer, languageid, feat_fusion_start, - feat_fusion_method_transducer, feat_fusion_method_languageid, loss_weight_transducer, loss_weight_lid, lid_length) + feat_fusion_method_transducer, feat_fusion_method_lid, loss_weight_transducer, loss_weight_lid, lid_length) @staticmethod def filter_args(**kwargs): @@ -76,8 +76,8 @@ def filter_args(**kwargs): child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNTransducer.filter_args(**kwargs["transducer"]) - child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) base_args["languageid"] = child_args return base_args @@ -91,7 +91,7 @@ def add_class_args(parser, prefix=None): RNNTransducer.add_class_args(parser, prefix="transducer") # HFWav2RNNTransducer.add_class_args(parser) ResNet1dLanguageID.add_class_args(parser, prefix="languageid") - # HFWav2LanguageID.add_class_args(parser) + HFWav2RNNTransducerLanguageID.add_class_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py index 8a06ebda..d38ab9a9 100644 --- a/hyperion/torch/trainers/transducer_languageid_trainer.py +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -192,7 +192,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics[k] = output[k].item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output["logits"], languageid) metric_acc.update(batch_metrics, batch_size) From 190ea29d37d2abecdc6f353e93e6f046dddd29dc Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 22:00:20 -0400 Subject: [PATCH 35/89] update configuration --- ...2base_rnnt_film_k2_pruned_stage2_v2.0.yaml | 76 +++++++++++++++++++ ...g_pruned_filmed_transducer_v2.0_13langs.sh | 2 +- 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml new file mode 100644 index 00000000..5a1555dd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 
'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh index e056cf03..0f3845d7 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -34,7 +34,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v2.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0009.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml nnet_s2_args="" From 16f8b499dce0d6a99ef9f7974b0a38ad2108c3ac Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 22:55:57 -0400 Subject: [PATCH 36/89] update config with mean pruned rnn loss --- ...train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml index 56e08794..faa265a3 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml @@ -9,14 +9,14 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 64 + min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.3 data_loader: num_workers: 8 val: @@ -29,14 +29,14 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 64 + min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 1.0 data_loader: 
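The `class_weighted_random_bucketing_seg_sampler` settings above (`weight_mode: "data-prior"`, `class_name: "language"`, `weight_exponent: 0.3`) flatten the language imbalance by sampling each language in proportion to its data prior raised to 0.3. The sketch below is an illustration of that weighting rule, not the hyperion sampler itself, and the per-language hours are hypothetical:

```
# Illustration of data-prior class weights with exponent 0.3: low-resource
# languages are over-sampled relative to their raw share of the data.
import numpy as np

def language_sampling_weights(hours_per_lang: dict, weight_exponent: float = 0.3):
    langs = list(hours_per_lang)
    priors = np.array([hours_per_lang[l] for l in langs], dtype=float)
    priors /= priors.sum()                  # data prior per language
    weights = priors ** weight_exponent     # flatten the distribution
    weights /= weights.sum()
    return dict(zip(langs, weights))

# Hypothetical amounts of training data (hours) per language:
print(language_sampling_weights({"en": 2000.0, "it": 300.0, "ga-IE": 5.0}))
```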
num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml From 1ebb2195d18b047fe1e045e9f18d993b380d6701 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 22:57:26 -0400 Subject: [PATCH 37/89] update config to use mean transducer loss --- ...e_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml index dfc64d75..275987d7 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -53,6 +53,7 @@ model: decoder: prune_range: 15 rnnt_loss: k2_pruned + reduction: mean simple_loss_scale: 0.2 predictor: embed_dim: 1024 @@ -104,7 +105,7 @@ model: dropout_rate: 0.3 hid_act: swish - loss_weight_transducer: 0.005 + loss_weight_transducer: 0.05 loss_weight_lid: 1.0 lid_length: 3.0 # feat_fusion_method: weighted-avg From c474869a61f335d5657b30a22b4f41ec7e36abe7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 03:00:00 +0000 Subject: [PATCH 38/89] update film parameter name --- ..._wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml | 11 ++++++----- .../wav2transducer/hf_wav2rnn_film_transducer.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml index 5a1555dd..a9a755ee 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml @@ -10,8 +10,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.3 + num_chunks_per_seg_epoch: 0.1 data_loader: num_workers: 1 @@ -34,8 +34,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -50,6 +50,7 @@ model: transducer: decoder: prune_range: 15 + reduction: mean override_dropouts: false trainer: optim: diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index dc28abb7..3f44c7c5 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -67,7 +67,7 @@ def _make_fuser(self): if self.feat_fusion_method == "film-weighted-avg": self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - elif self.feat_fusion_method == "weighted-avg-film": + elif self.feat_fusion_method == "film-fused-feature": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) self.film = FiLM(layer_dim, self.transducer.decoder.condition_size) elif self.feat_fusion_method == "weighted-avg": @@ -102,7 +102,7 @@ def _fuse_hid_feats(self, hid_feats, lang): film_hid_feats = torch.stack(film_hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) feats = torch.sum(film_hid_feats * norm_weights, dim=-1) - elif self.feat_fusion_method == "weighted-avg-film": + elif self.feat_fusion_method == "film-fused-feature": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) feats = torch.sum(hid_feats * norm_weights, dim=-1) From 1f7e70bcbf2f9dd8d83dcc59021897d398d789c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 04:39:09 +0000 Subject: [PATCH 39/89] update more options for film model --- hyperion/torch/narchs/rnn_film_transducer_decoder.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 2797d5a3..cc1dd2e3 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -20,7 +20,8 @@ from ...utils.misc import filter_func_args from ...utils.text import add_sos -from ..layer_blocks import TransducerFiLMJoiner as Joiner +from ..layer_blocks import TransducerFiLMJoiner as FiLMJoiner +from ..layer_blocks import TransducerJoiner as Joiner from ..layer_blocks import TransducerRNNFiLMPredictor as RNNPredictor from .net_arch import NetArch @@ -131,6 +132,11 @@ def _make_joiner(self): # Add FiLM args to the joiner args if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size) + elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, From c6f5dee27519787cc856060372607bc0e4d47280 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 04:43:31 +0000 Subject: [PATCH 40/89] add reduction option in fine-tune argument --- hyperion/torch/narchs/rnn_film_transducer_decoder.py | 9 +++++++++ hyperion/torch/narchs/rnn_transducer_decoder.py | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 2797d5a3..976b9872 100644 --- 
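The fusion methods renamed here (`film-weighted-avg`, `film-fused-feature`) condition the wav2vec2 features on a language embedding through FiLM layers: `film-fused-feature` applies a single FiLM block to the weighted average of the hidden layers, while `film-weighted-avg` applies one block per hidden layer before averaging. A generic FiLM block, shown only as a reference sketch and not hyperion's own `FiLM` class, looks like this:

```
# Generic FiLM block: a condition vector is projected to a per-feature scale
# (gamma) and shift (beta) that modulate the acoustic features.
import torch
import torch.nn as nn

class SimpleFiLM(nn.Module):
    def __init__(self, feat_dim: int, condition_size: int):
        super().__init__()
        self.to_gamma = nn.Linear(condition_size, feat_dim)
        self.to_beta = nn.Linear(condition_size, feat_dim)

    def forward(self, x: torch.Tensor, condition: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, feat_dim); condition: (batch, condition_size)
        gamma = self.to_gamma(condition).unsqueeze(1)
        beta = self.to_beta(condition).unsqueeze(1)
        return gamma * x + beta
```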
a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -640,12 +640,15 @@ def change_config( embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, prune_range: Optional[int] = None, + reduction: Optional[str] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) if prune_range is not None: self.prune_range = prune_range + if reduction is not None: + self.reduction = reduction @staticmethod def filter_args(**kwargs): @@ -843,6 +846,12 @@ def add_finetune_args(parser, prefix=None, skip=set()): help="""how many symbols to keep for each frame in k2 rnn-t pruned loss.""") + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index efc11113..44cf5350 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -618,12 +618,15 @@ def change_config( embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, prune_range: Optional[int] = None, + reduction: Optional[str] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) if prune_range is not None: self.prune_range = prune_range + if reduction is not None: + self.reduction = reduction @staticmethod def filter_args(**kwargs): @@ -809,6 +812,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): type=float, help=("dropout prob for decoder RNN ")) + + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + parser.add_argument( "--prune-range", default=5, From 8e82143904ccef496d71567b37bf609255f0c053 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 19:52:19 +0000 Subject: [PATCH 41/89] update configuration --- ...se_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml index 275987d7..43e6ba3a 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -105,7 +105,7 @@ model: dropout_rate: 0.3 hid_act: swish - loss_weight_transducer: 0.05 + loss_weight_transducer: 0.1 loss_weight_lid: 1.0 lid_length: 3.0 # feat_fusion_method: weighted-avg From c6ec4e27bdf269159d70aa68b952168db7390408 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 23 May 2023 00:53:50 +0000 Subject: [PATCH 42/89] fix film bug --- hyperion/torch/narchs/rnn_film_transducer_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 3790065c..e070f70b 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -135,12 +135,12 @@ def 
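The new `--reduction` fine-tune option switches the RNN-T loss between sum and mean reduction. The practical effect, illustrated below with hypothetical per-utterance losses (the actual loss computation is delegated to k2 and is not shown here), is that a sum-reduced loss grows with the batch, while a mean-reduced loss stays roughly batch-size independent, which is presumably why these patches can raise `loss_weight_transducer` from 0.005 to 0.1 once `reduction: mean` is configured:

```
# Toy illustration of sum vs. mean reduction over hypothetical per-utterance
# RNN-T losses; only the scale of the aggregated value changes.
import torch

per_utt_loss = torch.tensor([120.0, 95.0, 140.0])
print("sum reduction :", per_utt_loss.sum().item())   # grows with batch size
print("mean reduction:", per_utt_loss.mean().item())  # stable w.r.t. batch size
```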
_make_joiner(self): pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size) + self.vocab_size, self.condition_size) elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size, self.condition_size) + self.vocab_size) else: raise ValueError(f"Unknown joiner type {joiner_type}") From 27878914b1bc20b2dbeb5c1139b6d23f2857cd07 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 25 May 2023 09:19:23 -0400 Subject: [PATCH 43/89] sre21 8k adapted to persephone branck --- egs/sre21-av-a/v1.16k/README.md | 22 +- .../v1.16k/local/score_sre21_official.sh | 2 +- egs/sre21-av-a/v1.8k/README.md | 53 ++- egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh | 2 +- egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh | 4 +- egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh | 2 +- egs/voxceleb/v1.1/local | 1 - .../{v1 => v1.1}/local/attack_analysis.py | 0 .../{v1 => v1.1}/local/attack_analysis.sh | 0 .../local/calibrate_voxceleb1_o_clean.sh | 0 egs/voxceleb/{v1 => v1.1}/local/make_musan.py | 0 egs/voxceleb/{v1 => v1.1}/local/make_musan.sh | 0 .../{v1 => v1.1}/local/make_rirs_data.sh | 0 .../{v1 => v1.1}/local/make_some_figs.py | 0 .../make_train_lists_sup_embed_with_augm.sh | 0 .../{v1 => v1.1}/local/make_trials_subset.py | 0 .../{v1 => v1.1}/local/make_vox2_trials.py | 0 .../{v1 => v1.1}/local/make_voxceleb1_o.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_oeh.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_old.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_orig.pl | 0 .../local/make_voxceleb1_orig_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_v2_o.pl | 0 .../local/make_voxceleb1_v2_oeh.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1cat.pl | 0 .../local/make_voxceleb1cat_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb2cat.pl | 0 .../local/prepare_voxsrc22_dev.py | 0 .../local/prepare_voxsrc22_test.py | 0 egs/voxceleb/{v1 => v1.1}/local/score_dcf.py | 0 .../{v1 => v1.1}/local/score_voxceleb1.sh | 0 .../local/score_voxceleb1_o_clean.sh | 0 .../local/score_voxceleb1_single_cond.sh | 0 .../{v1 => v1.1}/local/score_voxsrc22_dev.sh | 0 egs/voxceleb/v1.1/run_002_compute_evad.sh | 1 - egs/voxceleb/v1.2/hyp_utils | 1 + ...aseplus_ecapatdnn512x3_phase1_default.yaml | 6 - ...aseplus_ecapatdnn512x3_phase2_default.yaml | 12 - ...aseplus_ecapatdnn512x3_phase3_default.yaml | 11 - ...lmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml | 24 -- ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh | 55 ---- egs/voxceleb/v2/local | 2 +- egs/voxceleb/v2/run_001_prepare_data.sh | 20 +- egs/voxceleb/v2/run_002_compute_evad.sh | 63 ++-- .../v2/run_003_prepare_noises_rirs.sh | 67 ++++ hyp_utils/conda_env.sh | 2 +- hyp_utils/create_data_split_dirs.sh | 3 +- hyperion/bin/hyperion_dataset.py | 93 ++++++ hyperion/bin/hyperion_tables.py | 129 ++++++++ hyperion/bin/train_xvector_from_wav.py | 10 +- hyperion/data_prep/data_prep.py | 1 - hyperion/data_prep/voxceleb1.py | 7 +- hyperion/data_prep/voxceleb2.py | 11 +- hyperion/data_prep/voxsrc22.py | 212 ++++++++++++ hyperion/torch/trainers/torch_trainer.py | 178 +++++----- hyperion/torch/trainers/xvector_trainer.py | 8 +- .../trainers/xvector_trainer_from_wav.py | 12 +- hyperion/utils/class_info.py | 27 +- hyperion/utils/dataset.py | 306 ++++++++++++++---- hyperion/utils/enrollment_map.py | 17 
+- hyperion/utils/info_table.py | 7 +- 63 files changed, 1024 insertions(+), 347 deletions(-) delete mode 120000 egs/voxceleb/v1.1/local rename egs/voxceleb/{v1 => v1.1}/local/attack_analysis.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/attack_analysis.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/calibrate_voxceleb1_o_clean.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_musan.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_musan.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_rirs_data.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_some_figs.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_train_lists_sup_embed_with_augm.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_trials_subset.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_vox2_trials.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_o.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_oeh.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_old.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_orig.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_orig_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2_o.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2_oeh.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1cat.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1cat_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb2cat.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/prepare_voxsrc22_dev.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/prepare_voxsrc22_test.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_dcf.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1_o_clean.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1_single_cond.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxsrc22_dev.sh (100%) create mode 120000 egs/voxceleb/v1.2/hyp_utils delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml delete mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh create mode 100755 egs/voxceleb/v2/run_003_prepare_noises_rirs.sh create mode 100644 hyperion/bin/hyperion_dataset.py create mode 100755 hyperion/bin/hyperion_tables.py create mode 100644 hyperion/data_prep/voxsrc22.py diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index 0f5d09ad..d90dc0a4 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -7,6 +7,20 @@ The systems runs at 16 kHz, telephone data is upsampled to 16k using SoX This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. 
Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -139,14 +153,6 @@ The back-end used for these results is: | config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | | config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | -## SRE-CTS Superset dev set - -| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | -| ------ | ---------- | ------------- | ------ | ------------- | ------------- | -| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | -| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | -| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | - ## SRE21 Audio Dev (official scoring tool) | Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh index a5bc03eb..e56906f6 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh +++ b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh @@ -18,7 +18,7 @@ echo "Score SRE21 ${track} ${subset} for $score_dir" soft_dir=./sre21/scoring_software -if [ ! -f $s_dir/sre_scorer.py ];then +if [ ! -f $soft_dir/sre_scorer.py ];then echo "downloading scoring tool" local/download_sre21_scoring_tool.sh fi diff --git a/egs/sre21-av-a/v1.8k/README.md b/egs/sre21-av-a/v1.8k/README.md index a105128c..b55f9bf0 100644 --- a/egs/sre21-av-a/v1.8k/README.md +++ b/egs/sre21-av-a/v1.8k/README.md @@ -10,6 +10,20 @@ copy the utt2est_lang files from the 16k data dirs to the VoxCeleb and SRE21 dat This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -91,8 +105,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_030_extract_xvectors.sh` @@ -111,4 +123,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.922 | 0.154 | 0.200 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.168 | 0.127 | 0.134 | + + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.39 | 0.072 | 0.095 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.175 | 0.057 | 0.069 | + + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 6.65 | 0.418 | 0.436 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 3.73 | 0.319 | 0.325 | + + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.44 | 0.388 | 0.390 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.21 | 0.356 | 0.377 | + diff --git a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh index a55761ae..92cbd887 100755 --- a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh +++ b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh @@ -153,7 +153,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh index f8eae0a1..6890eba9 100755 --- a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh @@ -187,7 +187,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 @@ -311,7 +311,7 @@ fi if [ $stage -le 7 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh index 263d7bbe..35afbb27 100755 --- a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh +++ b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/voxceleb/v1.1/local b/egs/voxceleb/v1.1/local deleted file mode 120000 index 740b697d..00000000 --- a/egs/voxceleb/v1.1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v1/local/attack_analysis.py 
b/egs/voxceleb/v1.1/local/attack_analysis.py similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.py rename to egs/voxceleb/v1.1/local/attack_analysis.py diff --git a/egs/voxceleb/v1/local/attack_analysis.sh b/egs/voxceleb/v1.1/local/attack_analysis.sh similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.sh rename to egs/voxceleb/v1.1/local/attack_analysis.sh diff --git a/egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1.1/local/make_musan.py similarity index 100% rename from egs/voxceleb/v1/local/make_musan.py rename to egs/voxceleb/v1.1/local/make_musan.py diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/voxceleb/v1.1/local/make_musan.sh similarity index 100% rename from egs/voxceleb/v1/local/make_musan.sh rename to egs/voxceleb/v1.1/local/make_musan.sh diff --git a/egs/voxceleb/v1/local/make_rirs_data.sh b/egs/voxceleb/v1.1/local/make_rirs_data.sh similarity index 100% rename from egs/voxceleb/v1/local/make_rirs_data.sh rename to egs/voxceleb/v1.1/local/make_rirs_data.sh diff --git a/egs/voxceleb/v1/local/make_some_figs.py b/egs/voxceleb/v1.1/local/make_some_figs.py similarity index 100% rename from egs/voxceleb/v1/local/make_some_figs.py rename to egs/voxceleb/v1.1/local/make_some_figs.py diff --git a/egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh b/egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh similarity index 100% rename from egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh rename to egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh diff --git a/egs/voxceleb/v1/local/make_trials_subset.py b/egs/voxceleb/v1.1/local/make_trials_subset.py similarity index 100% rename from egs/voxceleb/v1/local/make_trials_subset.py rename to egs/voxceleb/v1.1/local/make_trials_subset.py diff --git a/egs/voxceleb/v1/local/make_vox2_trials.py b/egs/voxceleb/v1.1/local/make_vox2_trials.py similarity index 100% rename from egs/voxceleb/v1/local/make_vox2_trials.py rename to egs/voxceleb/v1.1/local/make_vox2_trials.py diff --git a/egs/voxceleb/v1/local/make_voxceleb1_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_old.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_old.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_old.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_old.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl diff --git 
a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1.1/local/make_voxceleb2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2cat.pl diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_dev.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_test.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_test.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1.1/local/score_dcf.py similarity index 100% rename from egs/voxceleb/v1/local/score_dcf.py rename to egs/voxceleb/v1.1/local/score_dcf.py diff --git a/egs/voxceleb/v1/local/score_voxceleb1.sh b/egs/voxceleb/v1.1/local/score_voxceleb1.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh diff --git a/egs/voxceleb/v1/local/score_voxsrc22_dev.sh b/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxsrc22_dev.sh rename to egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 4e82a87a..27260be3 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ 
b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -24,7 +24,6 @@ if [ $stage -le 1 ]; then dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ - utils/create_split_dir.pl \ /export/b{04,05,06,07}/$dir_name $vaddir/storage elif [ "$nodes" == "b1" ];then utils/create_split_dir.pl \ diff --git a/egs/voxceleb/v1.2/hyp_utils b/egs/voxceleb/v1.2/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v1.2/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml deleted file mode 100644 index 8574a1cf..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: wavlmbaseplus_ecapatdnn512x3.yaml -trainer: trainer_phase1_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml deleted file mode 100644 index 87b01a1f..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml +++ /dev/null @@ -1,12 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.2 - margin_warmup_epochs: 0 - intertop_k: 5 - intertop_margin: 0.1 -trainer: trainer_phase2_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml deleted file mode 100644 index d13931e0..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml +++ /dev/null @@ -1,11 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.4 - margin_warmup_epochs: 0 - intertop_margin: 0. 
-trainer: trainer_phase3_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml index 34c6e8dc..d4db70a7 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml @@ -41,29 +41,6 @@ data: num_hard_prototypes: 0 data_loader: num_workers: 8 - -train: - dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 - val: - dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3.yaml trainer: optim: @@ -84,5 +61,4 @@ trainer: epochs: 60 eff_batch_size: 1024 train_mode: hf-feats-frozen-nograd - \ No newline at end of file diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh deleted file mode 100644 index 942fb336..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh +++ /dev/null @@ -1,55 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/model_ep0030.pth -nnet=$nnet_dir/model_ep0040.pth -nnet=$nnet_dir/model_ep0020.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local index 740b697d..2ac14857 120000 --- a/egs/voxceleb/v2/local +++ b/egs/voxceleb/v2/local @@ -1 +1 @@ -../v1/local/ \ No newline at end of file +../v1.1/local \ No newline at end of file diff --git a/egs/voxceleb/v2/run_001_prepare_data.sh b/egs/voxceleb/v2/run_001_prepare_data.sh index 7bf15448..44385610 100755 --- a/egs/voxceleb/v2/run_001_prepare_data.sh +++ b/egs/voxceleb/v2/run_001_prepare_data.sh @@ -12,7 +12,7 @@ config_file=default_config.sh . parse_options.sh || exit 1; . datapath.sh - +. 
$config_file if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. @@ -26,3 +26,21 @@ if [ $stage -le 2 ];then # Use this for the newer version of voxceleb1: local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2/run_002_compute_evad.sh b/egs/voxceleb/v2/run_002_compute_evad.sh index eeae00ac..1248ad39 100755 --- a/egs/voxceleb/v2/run_002_compute_evad.sh +++ b/egs/voxceleb/v2/run_002_compute_evad.sh @@ -19,39 +19,40 @@ config_file=default_config.sh if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" fi + fi fi -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? 
$num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done fi - diff --git a/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index ceee4e93..8d5c67c1 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -79,7 +79,7 @@ if [ $num_gpus -gt 0 ];then #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then - [[ $(type -P "$torchrun") ]] && command="torchrun" \ + [[ $(type -P "torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" fi diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh index 877b9e3f..06c30779 100755 --- a/hyp_utils/create_data_split_dirs.sh +++ b/hyp_utils/create_data_split_dirs.sh @@ -25,8 +25,7 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $linkdir ]; then echo "Prepare to distribute data over multiple $nodes nodes" dir_name=$storage_dir/$storage_name/storage if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - hyp_utils/create_split_dir.pl \ + hyp_utils/create_split_dir.pl \ /export/b{04,05,06,07}/$dir_name $link_dir elif [ "$nodes" == "b1" ];then hyp_utils/create_split_dir.pl \ diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py new file mode 100644 index 00000000..9e7bac5c --- /dev/null +++ b/hyperion/bin/hyperion_dataset.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Union, List +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + PathLike, + Dataset, + InfoTable, + RecordingSet, + FeatureSet, + ClassInfo, + EnrollmentMap, + SegmentSet, +) + +subcommands = ["add_features"] +# table_dict = { +# "segments": SegmentSet, +# "recordings": RecordingSet, +# "features": FeatureSet, +# "classes": ClassInfo, +# "enrollments": EnrollmentMap, +# "generic": InfoTable, +# } + + +def add_common_args(parser): + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_add_features_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--features-name", required=True, help="""name of the feature""" + ) + parser.add_argument("--features-file", required=True, help="""feature set file""") + + add_common_args(parser) + return parser + + +def add_features( + dataset: PathLike, + features_name: str, + features_file: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + dataset.add_features(features_name, features_file) + dataset.save(dataset) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommands: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(k, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + + globals()[subcommand](**kwargs) diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py new file mode 100755 index 00000000..a79a1dca --- /dev/null +++ b/hyperion/bin/hyperion_tables.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Union, List +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + PathLike, + InfoTable, + RecordingSet, + FeatureSet, + ClassInfo, + EnrollmentMap, + SegmentSet, +) + +subcommands = ["cat"] +table_dict = { + "segments": SegmentSet, + "recordings": RecordingSet, + "features": 
FeatureSet, + "classes": ClassInfo, + "enrollments": EnrollmentMap, + "generic": InfoTable, +} + + +def add_common_args(parser): + parser.add_argument( + "--table-type", + default="generic", + choices=list(table_dict.keys()), + help=f"Type of table in {list(table_dict.keys())}", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_cat_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-tables", + default=0, + type=int, + help="""number of jobs we used to create the individual tables""", + ) + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + + add_common_args(parser) + return parser + + +def cat( + table_type: str, + input_files: Union[List[PathLike], None], + output_file: PathLike, + num_table: int, + base_idx: int = 1, +): + + assert input_files is not None or num_jobs != 0 + output_file = Path(output_file) + if input_files is None: + ext = output_file.suffix + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_tables): + idx = base_idx + i + input_file_i = input_file_base.with_suffix(f".{idx}{ext}") + input_files.append(input_file_i) + + table_class = table_dict[table_type] + tables = [] + for file_path in input_files: + tables.append(table_class.load(file_path)) + + output_table = table_class.cat(tables) + output_table.save(output_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommands: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(k, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + + globals()[subcommand](**kwargs) diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 5c999dd1..a210d429 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -6,12 +6,14 @@ import logging import multiprocessing import os -import sys -import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index 19420761..d9828674 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -69,7 +69,6 @@ def get_recording_duration(self, recording_set): import itertools from ..utils import SCPList - # scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py 
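The new `hyperion_tables.py cat` subcommand merges the per-job tables written by an array job back into a single file. A rough programmatic equivalent, using the `load`/`cat`/`save` methods the tool itself calls and hypothetical per-job files `segments.1.csv` … `segments.4.csv` alongside the final output:

```
# Sketch of merging per-job segment tables into one table; file names are
# hypothetical and follow the output-file.N.csv convention used by the tool.
from pathlib import Path
from hyperion.utils import SegmentSet

num_tables = 4  # hypothetical number of array-job splits
parts = [SegmentSet.load(Path(f"data/train/segments.{i}.csv"))
         for i in range(1, num_tables + 1)]
SegmentSet.cat(parts).save("data/train/segments.csv")
```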
index 00b2e380..c23b64ff 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -214,7 +214,12 @@ def get_segmentid(s): return enrollments, trials def prepare(self): - + logging.info( + "Peparing VoxCeleb1 for %s corpus_dir:%s -> data_dir:%s", + self.task, + self.corpus_dir, + self.output_dir, + ) logging.info("getting audio meta-data") df_meta = self._get_metadata() logging.info("getting language estimations") diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 1a32420f..bef34ec9 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -136,6 +136,12 @@ def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): return file_path def prepare(self): + logging.info( + "Peparing VoxCeleb2 %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) logging.info("getting audio meta-data") df_meta = self._get_metadata() logging.info("getting language estimations") @@ -224,11 +230,6 @@ def prepare(self): "duration": recs.loc[rec_ids, "duration"].values, } ) - # print( - # recs.loc[rec_ids, "duration"], - # len(segments), - # len(recs.loc[rec_ids, "duration"]), - # ) segments = SegmentSet(segments) segments.sort() diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py new file mode 100644 index 00000000..1999262a --- /dev/null +++ b/hyperion/data_prep/voxsrc22.py @@ -0,0 +1,212 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxSRC22DataPrep(DataPrep): + """Class to prepare VoxSRC22 dev/test data + Attributes: + corpus_dir: input data directory + vox1_corpus_dir: input data directory for VoxCeleb1 + subset: subset of the data dev or test + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + vox1_corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = False + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + assert ( + vox1_corpus_dir is not None or subset == "test" + ), "dev set needs the VoxCeleb1 corpus dir" + self.subset = subset + self.vox1_corpus_dir = ( + None if vox1_corpus_dir is None else Path(vox1_corpus_dir) + ) + + @staticmethod + def dataset_name(): + return "voxceleb2" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + default="dev", + choices=["dev", "test"], + help="""vox2 subset in [dev, test]""", + ) + parser.add_argument( + "--vox1-corpus-dir", + default=None, + help="""corpus directory of voxceleb 1.""", + ) + + def prepare_track12_dev(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s + %s -> %s", + self.subset, + self.corpus_dir, + self.vox1_corpus_dir, + self.output_dir, + ) + logging.info("making trials") + trials_file = self.corpus_dir / "voxsrc2022_dev.txt" + df_in = pd.read_csv( + trials_file, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + modelid = df_in["enroll_file"] + segmentid = df_in["test_file"] + df_trials = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_trials.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_trials.to_csv(file_path, index=False) + trials = {"trials": file_path} + modelid = df_trials["modelid"].sort_values().unique() + uniq_segmentid = df_trials["segmentid"].sort_values().unique() + uniq_segmentid = np.unique(np.concatenate((uniq_segmentid, modelid), axis=0)) + + logging.info("making enrollment map") + df_enroll = pd.DataFrame({"modelid": modelid, "segmentid": modelid}) + file_path = self.output_dir / "enrollment.csv" + df_enroll.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + logging.info("making RecordingSet") + vox1_segmentid = [] + vox22_segmentid = [] + for s in uniq_segmentid: + if "VoxSRC2022_dev" in s: + vox22_segmentid.append(s) + else: + vox1_segmentid.append(s) + + vox1_rec_files = [ + glob.glob(f"{self.vox1_corpus_dir}/**/{s}") for s in vox1_segmentid + ] + vox22_rec_files = [ + glob.glob(f"{self.corpus_dir}/**/{s}") for s in vox22_segmentid + ] + rec_ids = vox22_segmentid + vox1_segmentid + rec_files = vox22_rec_files + vox1_rec_files + + recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame({"id": rec_ids,}) + segments = SegmentSet(segments) + segments.sort() + + logging.info("making dataset") + dataset = Dataset( + segments, + recordings={"recordings": recs}, + enrollments=enrollments, + trials=trials, + sparse_trials=False, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", len(segments), + ) + + # wav_file = voxsrc22_corpus_dir / file_id + # wav_file = vox1_corpus_dir / "wav" / file_id + # logging.info("searching audio files in %s", 
self.vox1_corpus_dir) + # vox1_rec_files = list(self.vox1_corpus_dir.glob("**/*.wav")) + # if not vox1_rec_files: + # # symlinks? try glob + # vox1_rec_files = [ + # Path(f) for f in glob.iglob(f"{self.vox1_corpus_dir}/**/*.wav", recursive=True) + # ] + + # vox1_rec_ids = [ f.parent.parent.name / f.parent.name / f.name for f in vox1_rec_files] + # rec_files = + + # rec_files = list(self.corpus_dir.glob("**/*.wav")) + # if not rec_files: + # # symlinks? try glob + # rec_files = [ + # Path(f) for f in glob.iglob(f"{self.corpus_dir}/**/*.wav", recursive=True) + # ] + + # u2s_file = output_dir / "utt2spk" + # logging.info("creating utt2spk file %s", u2s_file) + # file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) + # with open(u2s_file, "w") as f: + # for file_id in file_ids: + # f.write("%s %s\n" % (file_id, file_id)) + + # s2u_file = output_dir / "spk2utt" + # logging.info("creating spk2utt file %s", s2u_file) + # with open(s2u_file, "w") as f: + # for file_id in file_ids: + # f.write("%s %s\n" % (file_id, file_id)) + + # wav_file = output_dir / "wav.scp" + # logging.info("creating wav.scp file %s", wav_file) + # with open(wav_file, "w") as f: + # for file_id in file_ids: + # if "VoxSRC2022_dev" in file_id: + # wav_file = voxsrc22_corpus_dir / file_id + # else: + # wav_file = vox1_corpus_dir / "wav" / file_id + + # f.write("%s %s\n" % (file_id, wav_file)) + + def prepare_track12_test(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s -> %s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + + def prepare(self): + if self.subset == "dev": + self.prepare_track12_dev() + else: + self.prepare_track12_test() diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index a6f20a8e..c8565d1d 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -21,13 +21,17 @@ from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args -from ..loggers import (CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, - WAndBLogger) +from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger from ..lr_schedulers import LRScheduler as LRS from ..lr_schedulers import LRSchedulerFactory as LRSF from ..optim import OptimizerFactory as OF -from ..utils import (FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP, - tensors_subset) +from ..utils import ( + FairFullyShardedDDP, + FairShardedDDP, + MetricAcc, + TorchDDP, + tensors_subset, +) class DDPType(str, Enum): @@ -72,6 +76,7 @@ class TorchTrainer(object): input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
""" + def __init__( self, model, @@ -113,8 +118,9 @@ def __init__( self.exp_path = Path(exp_path) if loggers is None: - self.loggers = self._default_loggers(log_interval, use_tensorboard, - use_wandb, wandb) + self.loggers = self._default_loggers( + log_interval, use_tensorboard, use_wandb, wandb + ) elif isinstance(loggers, list): self.loggers = LoggerList(loggers) else: @@ -149,29 +155,23 @@ def __init__( self.rank = dist.get_rank() self.world_size = dist.get_world_size() if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, - self.model, - oss=oss) + self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( self.model, device_ids=[device], output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=True) + self.optimizer = self._make_optimizer(optim, self.model, oss=True) self.model = FairShardedDDP(self.model, self.optimizer) else: if self.rank == 0: @@ -184,9 +184,7 @@ def __init__( mixed_precision=self.use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=False) + self.optimizer = self._make_optimizer(optim, self.model, oss=False) else: self.optimizer = self._make_optimizer(optim, self.model) @@ -216,9 +214,9 @@ def __init__( if self.rank == 0: logging.info("init SWA model") self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR(self.optimizer, - swa_lr=self.swa_lr, - anneal_epochs=self.swa_anneal_epochs) + self.swa_scheduler = SWALR( + self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + ) def set_epoch(self, data_loader): try: @@ -252,8 +250,7 @@ def fit(self, train_data, val_data=None): if self.lr_scheduler is not None: # this is needed by cosine scheduler epoch_updates = int(len(train_data) / self.grad_acc_steps) - self.lr_scheduler.on_epoch_begin(epoch, - epoch_updates=epoch_updates) + self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) logs = self.train_epoch(train_data) if val_data is not None: @@ -275,8 +272,7 @@ def fit(self, train_data, val_data=None): self.save_checkpoint(logs) if self.in_swa: - self.loggers.on_epoch_begin(self.cur_epoch, - batches=len(train_data)) + self.loggers.on_epoch_begin(self.cur_epoch, batches=len(train_data)) self.model = self.swa_model.module logs = self.bn_update_epoch(train_data) @@ -351,16 +347,16 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.train() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data) + output = self.model(x) loss = self.loss(output, target) 
batch_metrics["loss"] = loss.mean().item() @@ -381,9 +377,9 @@ def bn_update_epoch(self, data_loader): def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): if self.ddp: if self.ddp_type == DDPType.DDP: - nn.utils.clip_grad_norm_(model.parameters(), - grad_clip, - norm_type=grad_clip_norm) + nn.utils.clip_grad_norm_( + model.parameters(), grad_clip, norm_type=grad_clip_norm + ) return if self.ddp_type == DDPType.FULLY_SHARDED_DDP: # we have to use the member function in FullyShardedDDP class @@ -395,24 +391,26 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): optim.clip_grad_norm(grad_clip, norm_type=grad_clip_norm) # if no DDP clip normally - nn.utils.clip_grad_norm_(model.parameters(), - grad_clip, - norm_type=grad_clip_norm) + nn.utils.clip_grad_norm_( + model.parameters(), grad_clip, norm_type=grad_clip_norm + ) def update_model(self): """Updates the model and does gradding clipping.""" if self.use_amp: if self.grad_clip > 0: self.grad_scaler.unscale_(self.optimizer) - self._clip_grad_norm(self.model, self.optimizer, - self.grad_clip, self.grad_clip_norm) + self._clip_grad_norm( + self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + ) self.grad_scaler.step(self.optimizer) self.grad_scaler.update() else: if self.grad_clip > 0: - self._clip_grad_norm(self.model, self.optimizer, - self.grad_clip, self.grad_clip_norm) + self._clip_grad_norm( + self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + ) self.optimizer.step() @@ -441,20 +439,21 @@ def _make_lr_sched(self, lr_sched, optim): lr_sched = LRSF.create(optim, **args) return lr_sched - def _default_loggers(self, log_interval, use_tensorboard, use_wandb, - wandb): + def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): """Creates the default data loaders""" prog_log = ProgLogger(interval=log_interval) csv_log = CSVLogger(self.exp_path / "train.log", append=True) loggers = [prog_log, csv_log] if use_tensorboard: loggers.append( - TensorBoardLogger(self.exp_path / "tb", interval=log_interval)) + TensorBoardLogger(self.exp_path / "tb", interval=log_interval) + ) if use_wandb: loggers.append( - WAndBLogger(**wandb, - path=self.exp_path / "wandb", - interval=log_interval)) + WAndBLogger( + **wandb, path=self.exp_path / "wandb", interval=log_interval + ) + ) return LoggerList(loggers) def _get_lr(self): @@ -478,7 +477,8 @@ def _compute_grad_acc_steps(self, data_loader): return self.grad_acc_steps = int( - math.ceil(self.eff_batch_size / batch_size / self.world_size)) + math.ceil(self.eff_batch_size / batch_size / self.world_size) + ) logging.info( "Setting grad_acc_steps=%d for " "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", @@ -502,30 +502,24 @@ def checkpoint(self, logs=None): logs: logs containing the current value of the metrics. 
""" checkpoint = { - "epoch": - self.cur_epoch, - "rng_state": - torch.get_rng_state(), - "model_cfg": - self.model.get_config(), - "model_state_dict": - self.model.state_dict(), - "optimizer_state_dict": - self.optimizer.state_dict(), - "loss_state_dict": - self.loss.state_dict() if self.loss is not None else None, + "epoch": self.cur_epoch, + "rng_state": torch.get_rng_state(), + "model_cfg": self.model.get_config(), + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.optimizer.state_dict(), + "loss_state_dict": self.loss.state_dict() + if self.loss is not None + else None, } if self.lr_scheduler is not None: - checkpoint[ - "lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() if logs is not None: checkpoint["logs"] = logs if self.in_swa: checkpoint["swa_model_state_dict"] = self.swa_model.state_dict() - checkpoint[ - "swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() + checkpoint["swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() return checkpoint @@ -535,8 +529,9 @@ def save_checkpoint(self, logs=None): Args: logs: logs containing the current value of the metrics. """ - if self.ddp and (self.ddp_type == DDPType.OSS_DDP - or self.ddp_type == DDPType.OSS_SHARDED_DDP): + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): # Not sure what this does, just copying from the example in # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py # Check the checkpointing in the case of the OSS optimizer @@ -591,17 +586,16 @@ def load_checkpoint(self, file_path): if self.loss is not None: self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict( - checkpoint["lr_scheduler_state_dict"]) + self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) if self.do_swa: if "swa_model_state_dict" in checkpoint: - self.swa_model.load_state_dict( - checkpoint["swa_model_state_dict"]) + self.swa_model.load_state_dict(checkpoint["swa_model_state_dict"]) self.swa_scheduler.load_state_dict( - checkpoint["swa_scheduler_state_dict"]) + checkpoint["swa_scheduler_state_dict"] + ) else: self.swa_scheduler = SWALR( self.optimizer, @@ -681,13 +675,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): "--eff-batch-size", type=int, default=None, - help= - "effective total batch size, if given, it overrides grad_acc_steps", + help="effective total batch size, if given, it overrides grad_acc_steps", ) - parser.add_argument("--epochs", - type=int, - default=200, - help="number of epochs") + parser.add_argument("--epochs", type=int, default=200, help="number of epochs") if train_modes is not None: parser.add_argument( "--train-mode", @@ -707,19 +697,12 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="use tensorboard logger", ) - parser.add_argument("--use-wandb", - action="store_true", - default=False, - help="use wandb logger") - parser.add_argument("--wandb.project", - default=None, - help="wandb project name") - parser.add_argument("--wandb.group", - default=None, - help="wandb group name") - parser.add_argument("--wandb.name", - default=None, - help="wandb display name") + parser.add_argument( + "--use-wandb", action="store_true", default=False, help="use wandb logger" + ) + parser.add_argument("--wandb.project", 
default=None, help="wandb project name") + parser.add_argument("--wandb.group", default=None, help="wandb group name") + parser.add_argument("--wandb.name", default=None, help="wandb display name") # parser.add_argument( # '--wandb.path', default=None, # help='wandb directory') @@ -748,10 +731,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) - parser.add_argument("--grad-clip", - type=float, - default=0, - help="gradient clipping norm value") + parser.add_argument( + "--grad-clip", type=float, default=0, help="gradient clipping norm value" + ) parser.add_argument( "--grad-clip-norm", default=2, @@ -764,10 +746,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=0, help="start epoch for SWA, if 0 it does not use SWA", ) - parser.add_argument("--swa-lr", - type=float, - default=1e-3, - help="learning rate for SWA phase") + parser.add_argument( + "--swa-lr", type=float, default=1e-3, help="learning rate for SWA phase" + ) parser.add_argument( "--swa-anneal-epochs", type=int, @@ -786,7 +767,6 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index eddf47a7..a59cbe14 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -101,16 +101,16 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data, y=target) + output = self.model(x, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 52474baa..0f6ccd9b 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -106,10 +106,10 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) with torch.no_grad(): - feats, feats_lengths = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths, y=target) @@ -159,10 +159,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) - 
feats, feats_lengths = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths) loss = self.loss(output, target) diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 70ee82c8..fe72339f 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -70,8 +70,33 @@ def load(cls, file_path, sep=None): if ext == "": # if no extension we load as kaldi utt2spk file df = pd.read_csv( - file_path, sep=" ", header=None, names=["id"], dtype={"id": np.str}, + file_path, + sep=" ", + header=None, + names=["id"], + dtype={"id": np.str}, ) return cls(df) return super().load(file_path, sep) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. + """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + assert df["id"].is_unique, """there are duplicated ids in original tables""" + if not df["class_idx"].is_unique: + logging.warning( + """class_idx in concat tables are not unique, + we will assign new class_idx""" + ) + df["class_idx"].drop(columns=["class_idx"], inplace=True) + return cls(df) diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index e6c9e861..0ef81ab6 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -20,21 +20,21 @@ class Dataset: - """ Class that contains all objects - (segments, recordings, features, class_infos) that - conform a dataset - - Attributes: - segments: SegmentSet object or path to it. - classes: Dictionary of ClassInfo objects or paths to then - recordings: Dictionary of RecordingSet objects or paths to then - features: Dictionary of FeatureSet objects or paths to then - enrollments: Dictionary of EnrollmentMap objects or paths to then - trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects - or paths to then - sparse_trials: load trial keys using the SparseTrialKey class instead - of TrialKey class. - table_sep: Column separator when reading/writting tables + """Class that contains all objects + (segments, recordings, features, class_infos) that + conform a dataset + + Attributes: + segments: SegmentSet object or path to it. + classes: Dictionary of ClassInfo objects or paths to then + recordings: Dictionary of RecordingSet objects or paths to then + features: Dictionary of FeatureSet objects or paths to then + enrollments: Dictionary of EnrollmentMap objects or paths to then + trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects + or paths to then + sparse_trials: load trial keys using the SparseTrialKey class instead + of TrialKey class. + table_sep: Column separator when reading/writting tables """ @@ -70,10 +70,12 @@ def __init__( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, EnrollmentMap, + enrollments, + EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, (TrialKey, TrialNdx, SparseTrialKey), + trials, + (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials @@ -217,16 +219,41 @@ def save( dataset_path: PathLike, update_paths: bool = True, table_sep: Optional[str] = None, + force_save_all: bool = False, ): - """Saves all the dataset objects. + """Saves the dataset to disk. 
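+
+        By default only tables that are loaded in memory, point to a different
+        path, or are missing from the output directory are re-written
+        (save_changed); force_save_all=True re-writes every table (save_all).
+        A minimal usage sketch, with hypothetical paths:
+
+            dataset = Dataset.load("data/train")
+            dataset.save("data/train")  # writes only new/changed tables
+            dataset.save("data/train", force_save_all=True)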
Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save the dataset info. - update_paths: whether to update the file_paths in the - data structures in the DateSet object + update_paths: whether to update the file_paths in the + data structures in the DataSet object + force_save_all: forces saving all tables even if they haven't changed, + otherwise, it only saves tables loaded in memory + and those that are not in the datadirectory + """ + if force_save_all: + self.save_all(dataset_path, update_paths, table_sep) + else: + self.save_changed(dataset_path, update_paths, table_sep) + def save_changed( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + force_save_all: bool = False, + ): + """Saves the tables that change in disk or tables + that are not in the ouput directory. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object """ table_sep = self.table_sep if table_sep is None else table_sep if update_paths: @@ -238,12 +265,139 @@ def save( file_name = f"segments{table_ext}" dataset["segments"] = file_name file_path = dataset_dir / file_name - self.segments().save(file_path, sep=table_sep) + if ( + self._segments is not None + or file_path != self._segments_path + or not file_path.exists() + ): + self.segments(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path + + file_names = {} + for k in self._recordings.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_paths[k] + or not file_path.exists() + ): + v = self.recordings_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + + if file_names: + dataset["recordings"] = file_names + + file_names = {} + for k in self._features.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._features is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + file_names = {} + for k, v in self._classes.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._classes is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + file_names = {} + for k, v in self._enrollments.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._enrollments is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in 
self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + + with open(dataset_file, "w") as f: + yaml.dump(dataset, f) + + def save_all( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): + """Saves all the dataset objects. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object + """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" + dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + dataset = {} + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments(keep_loaded=False).save(file_path, sep=table_sep) if update_paths: self._segments_path = file_path file_names = {} - for k, v in self.recordings(): + for k, v in self.recordings(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -255,7 +409,7 @@ def save( dataset["recordings"] = file_names file_names = {} - for k, v in self.features(): + for k, v in self.features(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -267,7 +421,7 @@ def save( dataset["features"] = file_names file_names = {} - for k, v in self.classes(): + for k, v in self.classes(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -279,7 +433,7 @@ def save( dataset["classes"] = file_names file_names = {} - for k, v in self.enrollments(): + for k, v in self.enrollments(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -291,7 +445,7 @@ def save( dataset["enrollments"] = file_names file_names = {} - for k, v in self.trials(): + for k, v in self.trials(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -329,8 +483,8 @@ def load( """Loads all the dataset objects. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save the dataset info. lazy: load data structures lazily when they are needed. 
sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class @@ -386,34 +540,64 @@ def load( return dataset - # dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) - # with open(dataset_file, "w") as f: - # dataset = yaml.safe_load(f) - - # assert "segments" in dataset - # segments = SegmentSet.load( - # Dataset.resolve_file_path(dataset_dir, dataset["segments"]) - # ) - # classes = None - # recordings = None - # features = None - # if "classes" in dataset: - # classes = {} - # for k, v in dataset["classes"]: - # classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) - - # if "recordings" in dataset: - # recordings = {} - # for k, v in dataset["recordings"]: - # recordings[k] = RecordingSet.load( - # Dataset.resolve_file_path(dataset_dir, v) - # ) - - # if "features" in dataset: - # features = {} - # for k, v in dataset["features"]: - # features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) - - # dataset = cls(segments, classes, recordings, features) - # if not lazy: - # dataset.update_from_disk() + def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if isinstance(features, (str, Path)): + self._features[features_name] = None + self._features_paths[features_name] = features + elif isinstance(features, FeatureSet): + self._features[features_name] = features + self._features_paths[features_name] = None + else: + raise ValueError() + + def add_recordings( + self, + recordings_name: str, + recordings: Union[PathLike, RecordingSet], + ): + if isinstance(features, (str, Path)): + self._recordings[features_name] = None + self._recordings_paths[recordings_name] = recordings + elif isinstance(recordings, RecordingSet): + self._recordings[recordings_name] = recordings + self._recordings_paths[recordings_name] = None + else: + raise ValueError() + + def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if isinstance(classes, (str, Path)): + self._classes[features_name] = None + self._classes_paths[classes_name] = classes + elif isinstance(classes, ClassInfo): + self._classes[classes_name] = classes + self._classes_paths[classes_name] = None + else: + raise ValueError() + + def add_enrollments( + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], + ): + if isinstance(features, (str, Path)): + self._enrollments[features_name] = None + self._enrollments_paths[enrollments_name] = enrollments + elif isinstance(enrollments, EnrollmentMap): + self._enrollments[enrollments_name] = enrollments + self._enrollments_paths[enrollments_name] = None + else: + raise ValueError() + + def add_trials( + self, + trials_name: str, + trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey], + ): + if isinstance(features, (str, Path)): + self._trials[features_name] = None + self._trials_paths[trials_name] = trials + elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): + self._trials[trials_name] = trials + self._trials_paths[trials_name] = None + else: + raise ValueError() diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py index 024e5b74..4af69144 100644 --- a/hyperion/utils/enrollment_map.py +++ b/hyperion/utils/enrollment_map.py @@ -18,12 +18,13 @@ class EnrollmentMap(InfoTable): """Class to store the mapping between enrollment id - and segmentids + and segmentids """ def __init__(self, df): if "modelid" in df: df.rename(columns={"modelid": "id"}, inplace=True) + assert "segmentid" in df super().__init__(df) def 
split(self, idx, num_parts): @@ -84,3 +85,17 @@ def load(cls, file_path, sep=None): df = pd.read_csv(file_path, sep=sep) return cls(df) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. + """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 6bcd4aca..45eab05f 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -176,8 +176,8 @@ def split(self, idx, num_parts, group_by=None): return self.__class__(df) @classmethod - def merge(cls, tables): - """Merges several tables. + def cat(cls, tables): + """Concatenates several tables. Args: info_lists: List of InfoTables @@ -187,6 +187,9 @@ def merge(cls, tables): """ df_list = [table.df for table in tables] df = pd.concat(df_list) + assert df[ + "id" + ].is_unique, """there are duplicated ids in the tables we are concatenating""" return cls(df) def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): From 7ea0eb08c2f74d1e57c4c77f1bba15201967f275 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 4 Jun 2023 06:29:50 +0000 Subject: [PATCH 44/89] update lid training for focal loss and hard negative sampling --- ...c2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml | 68 +++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml | 71 ++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml | 73 +++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml | 70 ++++++++++++++++++ ...2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml | 40 ++++++++++ .../v1/global_conf/config_lid_v6.0_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v6.2_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v6.3_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v6.4_13langs.sh | 42 +++++++++++ .../v1/local/initailize_lid_model.py | 6 +- hyperion/bin/finetune_wav2vec2languageid.py | 27 ++++++- hyperion/bin/train_wav2vec2languageid.py | 32 +++++++- hyperion/torch/losses/__init__.py | 1 + hyperion/torch/losses/focal_loss.py | 48 ++++++++++++ .../hf_wav2vec2rnn_film_transducer.py | 1 + 15 files changed, 599 insertions(+), 6 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh create mode 100644 hyperion/torch/losses/focal_loss.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml new file mode 100644 index 00000000..dc654278 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + 
return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml new file mode 100644 index 00000000..962af029 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 0.5 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml new file mode 100644 index 00000000..3918b04f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 
'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 0.5 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml new file mode 100644 index 00000000..17a13388 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml new file mode 100644 index 00000000..c40bcb1f --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + 
resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh new file mode 100644 index 00000000..ebbd7fd1 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.0_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh new file mode 100644 index 00000000..57fb5d0b --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml +nnet_s1_args="" 
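+# v6.2 uses the same XLS-R 300M + ECAPA-TDNN LID model as v6.0, but the stage-1
+# config above trains with a class-weighted cross-entropy
+# (loss: weightedCE, loss_weight_exp: 0.5) instead of plain cross-entropy.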
+nnet_name=${hf_model_name}_resnet1d_v6.2_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh new file mode 100644 index 00000000..d1847910 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.3_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.3.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.3_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh new file mode 100644 index 00000000..88190921 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + 
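+# v6.4 keeps the plain cross-entropy of v6.0 but turns on hard-negative language
+# sampling in the stage-1 config below (num_hard_prototypes: 8); v6.3 combines
+# this sampling with the weightedCE loss of v6.2.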
+nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.4_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.4.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.4_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py index 9a2c1a06..22e32bed 100644 --- a/egs/commonvoice/v1/local/initailize_lid_model.py +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -5,7 +5,7 @@ # LID_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" # output_model = "model_initialized.pth" -# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s1/model_ep0003.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s3/model_ep0001.pth +# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0008.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v6.0_13_langs.s1/model_ep0034.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v6.0_13_langs.s3/model_ep0001.pth ASR_model = torch.load(sys.argv[1]) LID_model = torch.load(sys.argv[2]) @@ -16,6 +16,8 @@ def copy_model_parameters(ASR_model, LID_model): ASR_state_dict = ASR_model["model_state_dict"] LID_state_dict = LID_model["model_state_dict"] + + #ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} # remove feature fuser @@ -46,4 +48,4 @@ def copy_model_parameters(ASR_model, LID_model): -copy_model_parameters(ASR_model, LID_model) \ No newline at end of file +copy_model_parameters(ASR_model, LID_model) diff --git a/hyperion/bin/finetune_wav2vec2languageid.py b/hyperion/bin/finetune_wav2vec2languageid.py index 4ac24e98..0403f84c 100755 --- a/hyperion/bin/finetune_wav2vec2languageid.py +++ b/hyperion/bin/finetune_wav2vec2languageid.py @@ -31,6 +31,11 @@ from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID from torch.nn.utils.rnn import pad_sequence +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, @@ -94,8 +99,9 @@ def init_data(partition, rank, num_gpus, **kwargs): } if num_gpus > 0 else {}) data_loader = 
torch.utils.data.DataLoader(dataset, batch_sampler=sampler, - **largs, - collate_fn=Language_collate) + **largs) + # , + # collate_fn=Language_collate) return data_loader @@ -114,6 +120,21 @@ def init_model(num_classes, in_model_file, rank, model_class, **kwargs): return model +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + def train_model(gpu_id, args): config_logger(args.verbose) @@ -138,6 +159,7 @@ def train_model(gpu_id, args): val_loader = init_data(partition="val", **kwargs) model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -148,6 +170,7 @@ def train_model(gpu_id, args): device=device, metrics=metrics, ddp=world_size > 1, + # loss_weight=train_loader.batch_sampler.class_info["weights"], **trn_args, ) trainer.load_last_checkpoint() diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py index 7af47d03..680ddd61 100755 --- a/hyperion/bin/train_wav2vec2languageid.py +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -23,6 +23,7 @@ from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.utils import ddp + from hyperion.torch.trainers import LanguageIDTrainer as Trainer from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -30,6 +31,11 @@ from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID from torch.nn.utils.rnn import pad_sequence +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, @@ -93,8 +99,9 @@ def init_data(partition, rank, num_gpus, **kwargs): } if num_gpus > 0 else {}) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, - **largs, - collate_fn=Language_collate) + **largs) + # , + # collate_fn=Language_collate) return data_loader @@ -109,6 +116,23 @@ def init_model(num_classes, rank, model_class, **kwargs): return model + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + def train_model(gpu_id, args): config_logger(args.verbose) @@ -129,6 +153,7 @@ def train_model(gpu_id, args): # device = "cpu" # world_size=1 + # import pdb; pdb.set_trace() train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) @@ -138,14 +163,17 @@ def train_model(gpu_id, args): if rank == 0: logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} + # import pdb; pdb.set_trace() trainer = Trainer( model, device=device, 
metrics=metrics, ddp=world_size > 1, + loss_weight=train_loader.batch_sampler.class_info["weights"], **trn_args, ) trainer.load_last_checkpoint() + init_hard_prototype_mining(trainer.model, train_loader, val_loader, rank) trainer.fit(train_loader, val_loader) ddp.ddp_cleanup() diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py index bf3ce279..55cc2f52 100644 --- a/hyperion/torch/losses/__init__.py +++ b/hyperion/torch/losses/__init__.py @@ -4,3 +4,4 @@ """ from .bce_with_llr import BCEWithLLR +from .focal_loss import FocalLoss \ No newline at end of file diff --git a/hyperion/torch/losses/focal_loss.py b/hyperion/torch/losses/focal_loss.py new file mode 100644 index 00000000..f2a0d32a --- /dev/null +++ b/hyperion/torch/losses/focal_loss.py @@ -0,0 +1,48 @@ +from torch import nn +import torch +from torch.nn import functional as F +import logging +class FocalLoss(nn.Module): + def __init__(self, alpha=0.25, gamma=2, size_average=True): + """ + Focal loss implementation: -alpha(1-yi)**gamma * ce_loss(xi,yi) + + :param alpha: scalar or list. Class weights. If scalar, the same weight applies for all classes. + :param gamma: scalar. Difficult-to-easy sample regulation parameter. + :param size_average: bool. Whether to average the loss over the batch. + :param device: str. Device to place the tensors. + """ + super(FocalLoss,self).__init__() + self.gamma = gamma + self.size_average = size_average + self.alpha = alpha + logging.info("FocalLoss: alpha={}, gamma={}, size_average={}".format(alpha, gamma, size_average)) + + def forward(self, preds, labels): + """ + Compute the focal loss. + + :param preds: Predicted classes. size:[B,N,C] or [B,C] + :param labels: Actual classes. size:[B,N] or [B] + :return: scalar. Loss value. + """ + preds = preds.view(-1, preds.size(-1)) + preds_logsoft = F.log_softmax(preds, dim=1) + preds_softmax = torch.exp(preds_logsoft) + + preds_softmax = preds_softmax.gather(1, labels.view(-1, 1)) + preds_logsoft = preds_logsoft.gather(1, labels.view(-1, 1)) + + if isinstance(self.alpha, torch.Tensor): + alpha = self.alpha.gather(0, labels.view(-1)) + else: # if alpha is a scalar + alpha = self.alpha + + loss = -torch.mul(torch.pow((1 - preds_softmax), self.gamma), preds_logsoft) + + loss = torch.mul(alpha, loss.t()) + if self.size_average: + loss = loss.mean() + else: + loss = loss.sum() + return loss diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py index 513d193c..9ee37287 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -12,6 +12,7 @@ from ...tpm import HFWav2Vec2 from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer from ..transducer import RNNFiLMTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): """Class for RNN-T with Wav2Vec2 features From aed329beeb9791c5992363a937bfa42f41b1f294 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 4 Jun 2023 20:38:35 +0000 Subject: [PATCH 45/89] update film transducer decoder for original joiner --- .../narchs/rnn_film_transducer_decoder.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index e070f70b..e655581a 100644 --- 
a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -168,7 +168,10 @@ def get_config(self): def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, y: torch.Tensor, y_lengths: torch.Tensor, pred_out: torch.Tensor, lang_embedding: torch.Tensor): - logits = self.joiner(x, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x, pred_out) + else: + logits = self.joiner(x, pred_out, lang_embedding) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS y_padded = y.pad(mode="constant", padding_value=0) @@ -194,7 +197,10 @@ def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths - logits = self.joiner(x, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x, pred_out) + else: + logits = self.joiner(x, pred_out, lang_embedding) with torch.cuda.amp.autocast(enabled=False): loss = k2.rnnt_loss( @@ -257,7 +263,11 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, # project_input=False since we applied the decoder's input projections # prior to do_rnnt_pruning (this is an optimization for speed). - logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(am_pruned, lm_pruned, project_input=False) + else: + logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) with torch.cuda.amp.autocast(enabled=False): @@ -374,7 +384,11 @@ def decode_greedy(self, while t < T and sym_per_utt < max_sym_per_utt: x_t = x[:, t:t + 1, :] - logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) # logits is log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) @@ -442,7 +456,10 @@ def decode_time_sync_beam_search(self, else: pred_out, pred_state = cache[cached_key] - logits = self.joiner(x_t, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) log_prob = logits.log_softmax(dim=-1) # log_prob is (1, 1, 1, vocab_size) log_prob = log_prob.squeeze() @@ -570,7 +587,10 @@ def decode_align_length_sync_beam_search( else: pred_out, pred_state = cache[cached_key] - logits = self.joiner(x_t, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) log_prob = log_prob.squeeze() # (vocab_size,) From 5ec0dc71e66dcfacf2cbcebee3594cd9fd25d6c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 5 Jun 2023 08:07:13 +0000 Subject: [PATCH 46/89] add different loss for lid --- .../bin/train_wav2vec2rnn_film_transducer.py | 3 ++- .../data/class_weighted_seg_chunk_sampler.py | 2 +- hyperion/torch/trainers/languageid_trainer.py | 27 +++++++++++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py index f06cc684..2306b467 100755 --- 
a/hyperion/bin/train_wav2vec2rnn_film_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -27,6 +27,7 @@ namespace_to_dict) from torch.nn.utils.rnn import pad_sequence + model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, @@ -225,7 +226,7 @@ def make_parser(model_class): parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") - parser.add_argument("--in-model-file", required=True) + # parser.add_argument("--in-model-file", required=True) model_class.add_class_args(parser, prefix="model") Trainer.add_class_args(parser, diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 8ec63b6f..afb663d5 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -245,7 +245,7 @@ def set_hard_prototypes(self, affinity_matrix): ).indices def get_hard_prototypes(self, class_idx): - return self.hard_prototypes[class_idx].flatten().numpy() + return self.hard_prototypes[class_idx].flatten().cpu().numpy() def _sample_chunk_length(self): if self.var_batch_size: diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py index add56c1e..ef252693 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -15,6 +15,8 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer +# from ..losses.focal_loss import FocalLoss +# from torchvision.ops.focal_loss import sigmoid_focal_loss class LanguageIDTrainer(TorchTrainer): @@ -78,10 +80,17 @@ def __init__( cpu_offload=False, input_key="x", target_key="language", + loss_weight=None, + loss_weight_exp=0.5, ): - if loss is None: + if loss == "CE" or loss is None: loss = nn.CrossEntropyLoss() + elif loss == "weightedCE": + loss = nn.CrossEntropyLoss(weight=torch.tensor(loss_weight.values, dtype=torch.float).to(device)**(-loss_weight_exp)) + logging.info(torch.tensor(loss_weight.values).to(device)**(-loss_weight_exp)) + elif loss == "focal_loss": + loss = FocalLoss(alpha=torch.tensor(focal_weight.values).to(device)**(-loss_weight_exp), gamma=2, size_average=True) super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) @@ -195,6 +204,11 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(LanguageIDTrainer.__init__, kwargs) + return args + @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): if prefix is not None: @@ -210,7 +224,16 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): parser.add_argument("--target-key", default="language", help="dict. 
key for nnet targets") - + if "loss" not in skip: + parser.add_argument("--loss", + default=None, + choices=["CE", "weightedCE", "focal_loss"], + help="loss function") + if "loss_weight_exp" not in skip: + parser.add_argument("--loss-weight-exp", + default=0.5, + type=float, + help="focal loss weight exponent") if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From 22920dca86a8c20afc0764564026d4c9826a096e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:10:02 +0000 Subject: [PATCH 47/89] add new training mode: film-ft, and add lid-film-asr system --- ..._wav2vec2rnn_film_transducer_languageid.py | 274 ++++++++++++++++++ hyperion/torch/models/__init__.py | 2 +- .../hf_wav2rnn_film_transducer.py | 13 + .../narchs/rnn_film_transducer_decoder.py | 16 +- 4 files changed, 302 insertions(+), 3 deletions(-) create mode 100755 hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py new file mode 100755 index 00000000..d5a6ad6f --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer, + HFWav2Vec2RNNTransducerResnet1D, + HFWav2Vec2RNNFiLMTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2rnn_film_transducer_resnet1d": HFWav2Vec2RNNFiLMTransducerResnet1D, +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = 
data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_language_collate) + return data_loader + +def init_model(blank_id, vocab_size, num_classes, loss_class_weight, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model_args["languageid"]["num_classes"] = num_classes + model_args["loss_class_weight"] = loss_class_weight + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + 
"--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 419ea742..62215e57 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -13,7 +13,7 @@ HFWav2Vec2RNNFiLMTransducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) -from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D +from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D, HFWav2Vec2RNNFiLMTransducerResnet1D from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 3f44c7c5..24efb44e 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -241,6 +241,12 @@ def infer(self, max_sym_per_utt=max_sym_per_utt) return y + def unfreeze_film(self): + for name, param in self.named_parameters(): + if "film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + def freeze_feat_fuser(self): if self.feat_fuser is None: return @@ -266,6 +272,9 @@ def set_train_mode(self, mode): self.unfreeze() elif mode == "frozen": self.freeze() + elif mode in ["ft-film", 
"ft-film-grad"]: + self.freeze() + self.unfreeze_film() elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -294,8 +303,10 @@ def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode in [ + "ft-film", "ft-transducer", "hf-feats-frozen", + "ft-film-grad", "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", @@ -310,8 +321,10 @@ def valid_train_modes(): return [ "full", "frozen", + "ft-film", "ft-embed-affine", "ft-transducer", + "ft-film-grad", "hf-feats-frozen", "ft-transducer-nograd", "hf-feats-frozen-nograd", diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index e655581a..1ccac6a9 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -28,7 +28,7 @@ @dataclass class Hypothesis: - ys: List[int] # predicted sequences + ys: List[int] # lid_pred sequences log_prob: float # log prob of ys # Optional LSTM predictor state. @@ -78,6 +78,7 @@ def __init__( pruned_warmup_steps: int = 2000, langs_size: int = 13, condition_size: int = 64, + film_type: str = "one-hot", ): super().__init__() @@ -96,12 +97,16 @@ def __init__( self.simple_loss_scale = simple_loss_scale self.pruned_warmup_steps = pruned_warmup_steps self.condition_size = condition_size + self.film_type = film_type self._make_predictor() self._make_joiner() # make embedding layer for language id - self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.film_type == "one-hot": + self.lang_embedding = nn.Embedding(langs_size, condition_size) + elif self.film_type == "lid_pred": + self.lang_embedding = nn.Linear(langs_size, condition_size) if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) self.simple_lm_proj = nn.Linear(self.predictor.out_feats, @@ -161,6 +166,7 @@ def get_config(self): "simple_loss_scale": self.simple_loss_scale, "pruned_warmup_steps": self.pruned_warmup_steps, "condition_size": self.condition_size, + "film_type": self.film_type, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @@ -815,6 +821,12 @@ def add_class_args(parser, type=int, required=True, help=("condition vector dimension")) + + parser.add_argument("--film-type", + default="one-hot", + choices=["one-hot", "lid_pred"], + help=("type of the condition of FiLM layer")) + parser.add_argument( "--lm-scale", From e12e9f5f4ffe2439d97911129c93fb1f04fa2f99 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:15:57 +0000 Subject: [PATCH 48/89] update configuration --- ...2base_rnnt_film_k2_pruned_stage3_v4.0.yaml | 92 +++++++++++++++++++ ...2base_rnnt_film_k2_pruned_stage3_v4.2.yaml | 92 +++++++++++++++++++ ...g_pruned_filmed_transducer_v4.0_13langs.sh | 45 +++++++++ ...g_pruned_filmed_transducer_v4.2_13langs.sh | 45 +++++++++ 4 files changed, 274 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml 
b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml new file mode 100644 index 00000000..48ad726c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml new file mode 100644 index 00000000..db1005b1 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh new file mode 100644 index 00000000..6391fc98 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh new file mode 100644 index 00000000..5de2bb92 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml 
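
For orientation, the class_weighted_random_bucketing_seg_sampler settings in the stage-3 configs above (weight_mode: "data-prior", weight_exponent: 0.3) amount to temperature-flattened language sampling. A minimal sketch of that weighting, assuming the exponent is applied to the per-language data prior and then renormalized; the segment counts below are hypothetical and only for illustration:

    import numpy as np

    # hypothetical segment counts per language (e.g. en, de, ga-IE)
    counts = np.array([500_000, 120_000, 8_000], dtype=np.float64)
    prior = counts / counts.sum()          # data prior per language
    weight_exponent = 0.3                  # value used in the configs above
    weights = prior ** weight_exponent     # flatten the prior toward uniform
    probs = weights / weights.sum()        # per-language sampling probabilities
    print(probs)                           # rare languages get a larger share than their raw prior
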
+ +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 7e1fdf8f84630fd55216da00fc084173e236d80a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:24:35 +0000 Subject: [PATCH 49/89] update model --- hyperion/torch/models/wav2transducer_languageid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hyperion/torch/models/wav2transducer_languageid/__init__.py b/hyperion/torch/models/wav2transducer_languageid/__init__.py index 98ebfdc7..bc785608 100644 --- a/hyperion/torch/models/wav2transducer_languageid/__init__.py +++ b/hyperion/torch/models/wav2transducer_languageid/__init__.py @@ -4,4 +4,5 @@ """ -from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D \ No newline at end of file +from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D +from .hf_wav2vec2rnn_film_transducer_languageid import HFWav2Vec2RNNFiLMTransducerResnet1D \ No newline at end of file From 4dfe23cd188acef302e5f8f73cf07ea917606296 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:25:40 +0000 Subject: [PATCH 50/89] update model for film_transducer_lid --- .../hf_wav2rnn_film_transducer_languageid.py | 578 ++++++++++++++++++ ..._wav2vec2rnn_film_transducer_languageid.py | 171 ++++++ 2 files changed, 749 insertions(+) create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py new file mode 100644 index 00000000..d967702a --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -0,0 +1,578 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn 
+from jsonargparse import ActionParser, ArgumentParser + +from ....utils import HypDataClass +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNFiLMTransducer, RNNTransducerOutput +from .hf_wav2rnn_transducer_languageid import RNNTransducerLanguageIDOutput +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ...layer_blocks import FiLM + + +class HFWav2RNNFiLMTransducerLanguageID(TorchModel): + """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + languageid: language identification model object. + feat_fusion_start: the input to the combined model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "film-weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp= 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNFiLMTransducer(**transducer) + else: + assert isinstance(transducer, RNNFiLMTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + + self.transducer = transducer + self.languageid = languageid + self.feat_fusion_start_transducer = feat_fusion_start_transducer + self.feat_fusion_start_lid = feat_fusion_start_lid + self.feat_fusion_method_transducer = feat_fusion_method_transducer + self.feat_fusion_method_lid = feat_fusion_method_lid + self.loss_lid_type = loss_lid_type + self.loss_class_weight = loss_class_weight + self.loss_class_weight_exp = loss_class_weight_exp + + if loss_lid_type == "CE" or loss_lid_type is None: + self.loss_lid = nn.CrossEntropyLoss() + elif loss_lid_type == "weightedCE": + self.loss_lid = nn.CrossEntropyLoss(weight=torch.tensor(loss_class_weight.values, dtype=torch.float)**(-loss_class_weight_exp)) + logging.info(torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp)) + elif loss_lid_type == "focal_loss": + self.loss_lid = FocalLoss(alpha=torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp), gamma=2, size_average=True) + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length + 
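        # Minimal illustrative sketch of the "weightedCE" option configured above, assuming
        # loss_class_weight is the sampler's class_info["weights"] column (per-language priors);
        # the priors below are hypothetical and only for illustration:
        #
        #   priors = torch.tensor([0.5, 0.01])   # frequent vs. rare language
        #   w = priors ** (-1.0)                 # loss_class_weight_exp = 1.0 (the default above)
        #   # w -> tensor([  2., 100.]); rare languages contribute more to the CE loss
        #   loss_lid = nn.CrossEntropyLoss(weight=w)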
self._hf_context = contextlib.nullcontext() + self.transducer_fuser, self.films = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) + + def _make_fuser(self, method, start): + feat_fuser = None + films = None + if method == "last": + return feat_fuser, films + num_layers = self.hf_feats.num_encoder_layers + 1 - start + layer_dim = self.hf_feats.hidden_size + if method == "film-weighted-avg": + films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "film-fused-feature": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + film = FiLM(layer_dim, self.transducer.decoder.condition_size) + elif method == "weighted-avg": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "linear": + feat_fuser = nn.Linear(num_layers, 1, bias=False) + feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif method == "cat": + feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + return feat_fuser, films + + def _fuse_transducer_hid_feats(self, hid_feats, lang): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + lang: language id Tensor. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + lang_condition = self.transducer.decoder.lang_embedding(lang) + hid_feats = hid_feats[self.feat_fusion_start_transducer:] + if self.feat_fusion_method_transducer == "film-weighted-avg": + film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = torch.stack(film_hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(film_hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_transducer == "film-fused-feature": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + feats = self.films(feats, lang_condition) + elif self.feat_fusion_method_transducer == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_transducer == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.transducer_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_transducer == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.transducer_fuser(hid_feats) + elif self.feat_fusion_method_transducer == "last": + feats = hid_feats[-1] + + return feats + + + def _fuse_lid_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start_lid:] + if self.feat_fusion_method_lid == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_lid == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.languageid_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_lid == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.languageid_fuser(hid_feats) + elif self.feat_fusion_method_lid == "last": + feats = hid_feats[-1] + + return feats + + def forward_lid_feats(self, + x, + x_lengths, + lang=None, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=True, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + + hid_feats = hf_output["hidden_states"] + feats = self._fuse_lid_hid_feats(hid_feats) + + + feats = feats.transpose(1, 2) + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + text=None, + languageid=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats_languageid, hid_feats, feat_lengths = self.forward_lid_feats( + x, x_lengths, return_feat_layers) + + lid_len = int(self.lid_length * 50) + min_len = torch.min(feat_lengths).item() + if min_len > lid_len: + lid_start = torch.randint(0, min_len - lid_len + 1, (1,)).item() + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + + + lid_logits = self.languageid( + feats_languageid, + None, + languageid, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + loss_lid = self.loss_lid(lid_logits, languageid) + + + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) + + trans_output = self.transducer( + feats_transducer, + feat_lengths, + text, + lid_logits + ) + + if return_feat_layers: + trans_output.h_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + loss_transducer=trans_output.loss, + loss_lid=loss_lid, + loss_transducer_simple=trans_output.loss_simple, + loss_transducer_pruned=trans_output.loss_pruned, + h_feats=trans_output.h_feats, + logits=lid_logits if return_logits else None) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + + + feats_languageid, hid_feats, feat_lengths = self.forward_lid_feats( + x, x_lengths, return_feat_layers) + + + lid = self.languageid( + feats_languageid.float(), + feat_lengths, + None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ) + + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid) # (N, T, C) + + + text = self.transducer.infer(feats_transducer, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + return text, lid + + # def freeze_feat_fuser(self): + # if self.feat_fuser is None: + # return + + # if self.feat_fusion_method_transducer == "weighted-avg": + # self.feat_fuser.requires_grad = False + # return + + # for param in self.feat_fuser.parameters(): + # param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start_transducer", + "feat_fusion_start_lid", + "feat_fusion_method_transducer", + "feat_fusion_method_lid", + "loss_lid_type", + "loss_class_weight", + "loss_class_weight_exp", + "loss_weight_transducer", + "loss_weight_lid", + "languageid", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + lid_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + del lid_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "languageid": lid_cfg, + "feat_fusion_start_transducer": self.feat_fusion_start_transducer, + "feat_fusion_start_lid": self.feat_fusion_start_lid, + "feat_fusion_method_transducer": self.feat_fusion_method_transducer, + "feat_fusion_method_lid": self.feat_fusion_method_lid, + "loss_lid_type": self.loss_lid_type, + "loss_class_weight": self.loss_class_weight, + "loss_class_weight_exp": self.loss_class_weight_exp, + 
"loss_weight_transducer": self.loss_weight_transducer, + "loss_weight_lid": self.loss_weight_lid, + "lid_length": self.lid_length, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer, languageid): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start-transducer", + default=0, + type=int, + help=""" + the input to transducer model will fuse the wav2vec + layers from feat_fusion_start_transducer to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-start-lid", + default=0, + type=int, + help=""" + the input to lid model will fuse the wav2vec + layers from feat_fusion_start_lid to + the wav2vec num_layers""", + ) + + parser.add_argument( + "--feat-fusion-method-transducer", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + parser.add_argument( + "--feat-fusion-method-lid", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + choices=["CE", "weightedCE", "focal_loss"], + help=("loss type for language identification"), + ) + parser.add_argument( + "--loss-class-weight", + default=None, + type=str, + help=("class weight for language identification"), + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=("class weight exponent for language identification"), + ) + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNFiLMTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py new file mode 100644 index 00000000..e012f17a --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,171 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import 
torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNFiLMTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ..wav2languageid import HFWav2Vec2ResNet1dLanguageID +from ..wav2transducer import HFWav2Vec2RNNFiLMTransducer + + +from .hf_wav2rnn_film_transducer_languageid import HFWav2RNNFiLMTransducerLanguageID + + +class HFWav2Vec2RNNFiLMTransducerResnet1D(HFWav2RNNFiLMTransducerLanguageID): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNFiLMTransducer], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp: float = 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(languageid, dict): + # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in languageid: + # del languageid["class_name"] + # languageid = ResNet1dLanguageID(**languageid) + # else: + # assert isinstance(languageid, ResNet1dLanguageID) + # assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + # hf_feats = wav2transducer.hf_feats + # transducer = wav2transducer.transducer + # languageid = wav2languageid.languageid + + + super().__init__(hf_feats, transducer, languageid, + feat_fusion_start_transducer=feat_fusion_start_transducer, + feat_fusion_start_lid=feat_fusion_start_lid, + feat_fusion_method_transducer=feat_fusion_method_transducer, + feat_fusion_method_lid=feat_fusion_method_lid, + loss_lid_type=loss_lid_type, + loss_class_weight=loss_class_weight, + loss_class_weight_exp=loss_class_weight_exp, + loss_weight_transducer=loss_weight_transducer, + loss_weight_lid=loss_weight_lid, + lid_length=lid_length) + + + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNFiLMTransducerLanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_class_args(parser, prefix="transducer") + # HFWav2RNNFiLMTransducer.add_class_args(parser) + ResNet1dLanguageID.add_class_args(parser, 
prefix="languageid") + HFWav2RNNFiLMTransducerLanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + + valid_args = ( + "loss_weight_transducer", + "loss_weight_lid", + "lid_length", + ) + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_finetune_args(parser, prefix="transducer") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 9e59d74fdd3b6ee36572286aa4f38637a8bb0c8e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Jun 2023 04:03:21 +0000 Subject: [PATCH 51/89] add activation option for film --- hyperion/torch/layer_blocks/film_blocks.py | 24 +++++++++++++++---- .../layer_blocks/transducer_film_joiner.py | 5 ++-- .../layer_blocks/transducer_film_predictor.py | 8 +++++-- .../hf_wav2rnn_film_transducer.py | 6 ++--- .../narchs/rnn_film_transducer_decoder.py | 24 ++++++++++++++----- 5 files changed, 48 insertions(+), 19 deletions(-) diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 9503fcfe..7d22416f 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -2,10 +2,20 @@ import torch.nn as nn class FiLM(nn.Module): - def __init__(self, input_size, condition_size): + def __init__(self, input_size, condition_size, film_type="linear"): # condition_size: the size of the language id vector # input_size: the size of the RNN input to the FiLM layer super(FiLM, self).__init__() + # if film_type == "tanh": + # self.linear_scale = nn.Sequential( + # nn.Linear(condition_size, input_size), + # nn.Tanh() + # ) + # self.linear_shift = nn.Sequential( + # nn.Linear(condition_size, input_size), + # nn.Tanh() + # ) + # elif film_type == "linear": self.linear_scale = nn.Linear(condition_size, input_size) self.linear_shift = nn.Linear(condition_size, input_size) @@ -24,7 +34,7 @@ def forward(self, x, lang_condition): class RNNWithFiLM(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm"): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh"): super(RNNWithFiLM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -32,11 +42,14 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, 
condition_size, self.dropout = dropout self.batch_first = batch_first self.rnn_type = rnn_type + self.film_type = film_type if self.rnn_type == "lstm": self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) elif self.rnn_type == "gru": self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) - self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) def forward(self, x, states, lang_condition): @@ -64,7 +77,7 @@ def forward(self, x, states, lang_condition): class RNNWithFiLMResidual(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual"): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual", film_type="linear"): super(RNNWithFiLMResidual, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -76,7 +89,8 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) elif self.rnn_type == "gru_residual": self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) - self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + self.film_type = film_type + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) def forward(self, x, states, lang_condition): diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py index 7fdae60d..02a9dfdf 100644 --- a/hyperion/torch/layer_blocks/transducer_film_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -21,7 +21,7 @@ class TransducerFiLMJoiner(nn.Module): vocab_size: vocabulary size """ - def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int): + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear"): super().__init__() self.enc_feats = enc_feats @@ -32,8 +32,7 @@ def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: self.enc_proj = nn.Linear(enc_feats, hid_feats) self.pred_proj = nn.Linear(pred_feats, hid_feats) self.output = nn.Linear(hid_feats, vocab_size) - - self.film = FiLM(hid_feats, condition_size) + self.film = FiLM(hid_feats, condition_size, film_type) def get_config(self): config = { diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py index cb628a2c..dc7a7ae4 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -38,6 +38,7 @@ def __init__(self, embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, rnn_type: str = "lstm", + film_type: str = "linear", blank_id: int = 0): super().__init__() self.embedding = nn.Embedding( @@ -54,7 +55,8 @@ def __init__(self, 
dropout=rnn_dropout_rate, condition_size=condition_size, batch_first=True, - rnn_type=rnn_type + rnn_type=rnn_type, + film_type=film_type ) elif rnn_type in ["lstm_residual","gru_residual"]: self.rnn = RNNWithFiLMResidual( @@ -64,7 +66,8 @@ def __init__(self, dropout=rnn_dropout_rate, condition_size=condition_size, batch_first=True, - rnn_type=rnn_type + rnn_type=rnn_type, + film_type=film_type ) else: raise Exception(f"Unknown RNN type {rnn_type}") @@ -97,6 +100,7 @@ def get_config(self): "embed_dropout_rate": self.embed_dropout_rate, "rnn_dropout_rate": self.rnn_dropout_rate, "rnn_type": self.rnn_type, + "film_type": self.film_type, "blank_id": self.blank_id, } return config diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 24efb44e..84f2239c 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -65,11 +65,11 @@ def _make_fuser(self): num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start layer_dim = self.hf_feats.hidden_size if self.feat_fusion_method == "film-weighted-avg": - self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size, self.transducer.decoder.film_type) for _ in range(num_layers)]) self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "film-fused-feature": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - self.film = FiLM(layer_dim, self.transducer.decoder.condition_size) + self.film = FiLM(layer_dim, self.transducer.decoder.condition_size, self.transducer.decoder.film_type) elif self.feat_fusion_method == "weighted-avg": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": @@ -251,7 +251,7 @@ def freeze_feat_fuser(self): if self.feat_fuser is None: return - if self.feat_fusion_method == "weighted-avg": + if self.feat_fusion_method in ["weighted-avg", "film-weighted-avg", "film-fused-feature"]: self.feat_fuser.requires_grad = False return diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 1ccac6a9..6a5c0845 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -78,7 +78,8 @@ def __init__( pruned_warmup_steps: int = 2000, langs_size: int = 13, condition_size: int = 64, - film_type: str = "one-hot", + film_cond_type: str = "one-hot", + film_type: str = "linear", ): super().__init__() @@ -97,15 +98,16 @@ def __init__( self.simple_loss_scale = simple_loss_scale self.pruned_warmup_steps = pruned_warmup_steps self.condition_size = condition_size + self.film_cond_type = film_cond_type self.film_type = film_type self._make_predictor() self._make_joiner() # make embedding layer for language id - if self.film_type == "one-hot": + if self.film_cond_type == "one-hot": self.lang_embedding = nn.Embedding(langs_size, condition_size) - elif self.film_type == "lid_pred": + elif self.film_cond_type == "lid_pred": self.lang_embedding = nn.Linear(langs_size, condition_size) if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) @@ -140,7 +142,7 @@ def _make_joiner(self): pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] 
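+            # FiLM conditioning (sketch): given joiner activations x and a
+            # language condition vector c of size condition_size, the FiLM block
+            # computes roughly
+            #   y = linear_scale(c) * x + linear_shift(c)
+            # i.e. a per-feature affine modulation of x by the language embedding.
+            # film_type selects plain Linear scale/shift projections vs. Linear
+            # followed by Tanh (the Tanh variant is enabled later in this series).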
self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size, self.condition_size) + self.vocab_size, self.condition_size, self.film_type) elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] @@ -166,6 +168,7 @@ def get_config(self): "simple_loss_scale": self.simple_loss_scale, "pruned_warmup_steps": self.pruned_warmup_steps, "condition_size": self.condition_size, + "film_cond_type": self.film_cond_type, "film_type": self.film_type, } base_config = super().get_config() @@ -722,6 +725,12 @@ def add_pred_args(parser): help= """type of recurrent network for thep predictor in [lstm, gru]""") + pred_parser.add_argument("--film-type", + default="linear", + choices=["linear", "tanh"], + help=("type of the FiLM layer")) + + pred_parser.add_argument("--num-layers", default=2, type=int, @@ -822,12 +831,15 @@ def add_class_args(parser, required=True, help=("condition vector dimension")) - parser.add_argument("--film-type", + parser.add_argument("--film-cond-type", default="one-hot", choices=["one-hot", "lid_pred"], help=("type of the condition of FiLM layer")) - + parser.add_argument("--film-type", + default="linear", + choices=["linear", "tanh"], + help=("type of the FiLM layer")) parser.add_argument( "--lm-scale", default=0.25, From 1f56469f639ecabc80de1bd57f8d66d70d236809 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Jun 2023 04:06:52 +0000 Subject: [PATCH 52/89] add configuration --- ...ase_rnnt_film_k2_pruned_stage3_v4.2.1.yaml | 92 +++++++++++++++++++ ...ase_rnnt_film_k2_pruned_stage4_v4.2.1.yaml | 76 +++++++++++++++ ...pruned_filmed_transducer_v4.2.1_13langs.sh | 45 +++++++++ 3 files changed, 213 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml new file mode 100644 index 00000000..d6c995e8 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
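+      # Sampler notes (these options recur in all the configs below):
+      # max_batch_length / max_audio_length appear to be seconds of audio per
+      # batch / per utterance, weight_exponent (0.3) smooths the per-language
+      # sampling weights derived from the data prior, and num_chunks_per_seg_epoch
+      # sets what fraction of the data defines one epoch.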
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml new file mode 100644 index 00000000..aaf5dedb --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15 + max_audio_length: 12. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15 + max_audio_length: 12. 
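+      # Stage-4 note: the model section below only overrides the decoder pruning
+      # options; the rest of the model is presumably loaded from the stage-3 FiLM
+      # checkpoint, and train_mode: full presumably also unfreezes the wav2vec2
+      # encoder (stage 3 used hf-feats-frozen-nograd).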
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + transducer: + decoder: + prune_range: 15 + reduction: mean +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh new file mode 100644 index 00000000..d209d421 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0012.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 35cb6f3b7c043a97ca1952b6dc437df85ddbc20a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 11 Jun 2023 13:34:40 +0000 Subject: [PATCH 53/89] add config for film-asr-lid model --- ...uned_filmed_transducer_lid_v1.0_13langs.sh | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh new file mode 100644 index 00000000..8d9e95d3 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model 
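+# (Note: the WavLM description above appears to be a leftover recipe header;
+# this config, like the others in this series, uses wav2vec2-xls-r-300m.)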
+hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 250bacf66edd358dad77751c6b3010f720b0a919 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 13 Jun 2023 07:21:15 +0000 Subject: [PATCH 54/89] add new configs for film model --- ...2base_rnnt_film_k2_pruned_stage3_v4.3.yaml | 92 ++++++++++++++++++ ...2base_rnnt_film_k2_pruned_stage3_v4.4.yaml | 94 +++++++++++++++++++ hyperion/torch/layer_blocks/film_blocks.py | 24 ++--- hyperion/torch/trainers/torch_trainer.py | 3 + 4 files changed, 201 insertions(+), 12 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml new file mode 100644 index 00000000..e436c876 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
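+      # v4.3 (below): switches feature fusion to film-fused-feature, raises the
+      # learning rate to 1e-3, and trains with train_mode: ft-film, which
+      # presumably updates only the FiLM conditioning parameters while the rest
+      # of the network stays frozen.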
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml new file mode 100644 index 00000000..72a4c6a6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
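+      # v4.4 (below): sets film_type: tanh for both the decoder and the
+      # predictor, matching the Tanh-wrapped scale/shift projections enabled in
+      # film_blocks.py in this same commit, and goes back to film-weighted-avg
+      # feature fusion.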
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + film_type: tanh + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + film_type: tanh + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 7d22416f..00ee1a10 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -6,18 +6,18 @@ def __init__(self, input_size, condition_size, film_type="linear"): # condition_size: the size of the language id vector # input_size: the size of the RNN input to the FiLM layer super(FiLM, self).__init__() - # if film_type == "tanh": - # self.linear_scale = nn.Sequential( - # nn.Linear(condition_size, input_size), - # nn.Tanh() - # ) - # self.linear_shift = nn.Sequential( - # nn.Linear(condition_size, input_size), - # nn.Tanh() - # ) - # elif film_type == "linear": - self.linear_scale = nn.Linear(condition_size, input_size) - self.linear_shift = nn.Linear(condition_size, input_size) + if film_type == "tanh": + self.linear_scale = nn.Sequential( + nn.Linear(condition_size, input_size), + nn.Tanh() + ) + self.linear_shift = nn.Sequential( + nn.Linear(condition_size, input_size), + nn.Tanh() + ) + elif film_type == "linear": + self.linear_scale = nn.Linear(condition_size, input_size) + self.linear_shift = nn.Linear(condition_size, input_size) def forward(self, x, lang_condition): # import pdb; pdb.set_trace() diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 7ae7c50e..f98ff2b9 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -245,6 +245,9 @@ def fit(self, train_data, val_data=None): val_logs = {} self.loggers.on_train_begin(epochs=self.epochs) + if self.cur_epoch == 0: + self.save_checkpoint() + # exit() for epoch in range(self.cur_epoch, self.epochs): self.set_epoch(train_data) self.loggers.on_epoch_begin(epoch, batches=len(train_data)) From 410100c482fb4b1a4b8f98bc44b5e0da57f58f7c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 13 Jun 2023 07:27:08 +0000 Subject: [PATCH 55/89] update config --- ...g_pruned_filmed_transducer_v4.3_13langs.sh | 45 +++++++++++++++++++ ...g_pruned_filmed_transducer_v4.4_13langs.sh | 45 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh diff --git 
a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh new file mode 100644 index 00000000..0134e84f --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.3_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh new file mode 100644 index 00000000..99b5d16c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.4_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.4.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 
+nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 2a8c3c4c81d37ac3e34ee5a1098553b63abfc0e5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 18 Jun 2023 11:35:28 +0000 Subject: [PATCH 56/89] update config for film ASR --- ...c2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml | 65 +++++++++++++++++++ ...g_pruned_filmed_transducer_v5.1_13langs.sh | 45 +++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml new file mode 100644 index 00000000..a15272d4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_do0.5.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh new file mode 100644 index 00000000..ab3d1ec8 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model 
+bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v5.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 19078699acbf5eb48dfa34aaebb5ed7eea8a58f0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 18 Jun 2023 17:12:32 +0000 Subject: [PATCH 57/89] add config for film ASR --- ...2base_rnnt_film_k2_pruned_stage3_v5.1.yaml | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml new file mode 100644 index 00000000..8947cfd0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
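+      # v5.1 (below): reduces the FiLM condition_size from 256 to 128 and uses a
+      # longer hold (hold_steps: 90000) with a gentler decay (decay_rate: 0.8),
+      # still fine-tuning only the FiLM parameters (train_mode: ft-film).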
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.8 + decay_steps: 45000 + hold_steps: 90000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + From 87822f6d5522ab51ee58fa464b97403711df63be Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 23 Jun 2023 02:03:11 +0000 Subject: [PATCH 58/89] add bias initialization --- ...2base_rnnt_film_k2_pruned_stage3_v5.6.yaml | 92 +++++++++++++++++++ ...g_pruned_filmed_transducer_v5.6_13langs.sh | 45 +++++++++ .../v1/local/initailize_film_model_bias.py | 67 ++++++++++++++ 3 files changed, 204 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh create mode 100644 egs/commonvoice/v1/local/initailize_film_model_bias.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml new file mode 100644 index 00000000..a3f25ffd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
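+      # v5.6 (below) drops the FiLM fine-tuning LR to 1e-4 and appears intended
+      # to start from a checkpoint produced by local/initailize_film_model_bias.py
+      # (added later in this patch), which copies a pretrained non-FiLM transducer
+      # into the FiLM model and initializes the FiLM projections to the identity
+      # (scale weight 0, scale bias 1, shift weight/bias 0).
+      # Usage sketch:
+      #   python local/initailize_film_model_bias.py \
+      #     <pretrained_model.pth> <film_model_init.pth> <output_model.pth>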
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.8 + decay_steps: 45000 + hold_steps: 40000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh new file mode 100644 index 00000000..f0db5fb6 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.6_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.6.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v5.6.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_film_model_bias.py b/egs/commonvoice/v1/local/initailize_film_model_bias.py new file mode 100644 index 00000000..6abedf57 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_film_model_bias.py @@ -0,0 +1,67 @@ +import torch +import sys + +# arguments example +# pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# film_model = 
"exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +pretrained_model = torch.load(sys.argv[1]) +film_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def update_film_lstm_parameters(film_state_dict, pretrained_state_dict): + for i in range(2): + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_ih_l0"] = pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_hh_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_ih_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_hh_l' + str(i)].clone() + return film_state_dict + +def copy_model_parameters(pretrained_model, film_model): + pretrained_state_dict = pretrained_model["model_state_dict"] + film_state_dict = film_model["model_state_dict"] + update_state_dict = {name: param for name, param in pretrained_state_dict.items() if name in film_state_dict and param.shape == film_state_dict[name].shape} + + film_update_state_dict = {} + for name, param in film_state_dict.items(): + if "linear_scale.weight" in name: + film_update_state_dict[name] = torch.zeros_like(param) + elif "linear_scale.bias" in name: + film_update_state_dict[name] = torch.ones_like(param) + elif "linear_shift.weight" in name or "linear_shift.bias" in name: + film_update_state_dict[name] = torch.zeros_like(param) + # import pdb; pdb.set_trace() + new_film_state_dict = film_state_dict.copy() + new_film_state_dict.update(update_state_dict) + new_film_state_dict.update(film_update_state_dict) + + + new_film_state_dict = update_film_lstm_parameters(new_film_state_dict, pretrained_state_dict) + + film_model["model_state_dict"] = new_film_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in film_state_dict.items(): + if torch.all(torch.eq(param, new_film_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in pretrained_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + film_model["epoch"] =1 + torch.save(film_model, output_model) + + + +unchanged_parameters = copy_model_parameters(pretrained_model, film_model) \ No newline at end of file From 27eea766c242253946179401c9dfc7ac2092313a Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 24 Jun 2023 03:27:20 -0400 Subject: [PATCH 59/89] add new config --- ...2base_rnnt_film_k2_pruned_stage3_v6.0.yaml | 92 +++++++++++++++++++ ...g_pruned_filmed_transducer_v6.0_13langs.sh | 45 +++++++++ 2 files changed, 137 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml new file mode 100644 index 00000000..4a72296d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 4500 + hold_steps: 4000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh new file mode 100644 index 00000000..71d38168 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml +nnet_s1_args="" + 
+nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v6.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From f809855cb14126a714fba03d7df15cef9f799c0e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 28 Jun 2023 10:10:27 +0000 Subject: [PATCH 60/89] first config for film-lid model --- ...ucer_ecapadnn512x3_1layer_stage1_v1.0.yaml | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml new file mode 100644 index 00000000..da03a499 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml @@ -0,0 +1,139 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
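+      # Joint FiLM-ASR + LID model: film_cond_type: lid_pred (below) appears to
+      # derive the FiLM condition from the language-ID branch's predictions
+      # through a Linear(langs_size, condition_size) embedding, as added to the
+      # decoder earlier in this series, instead of a one-hot language label.
+      # The two losses are weighted below with loss_weight_transducer: 0.1 and
+      # loss_weight_lid: 1.0.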
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 0.1 + loss_weight_lid: 1.0 + lid_length: 3.0 + + feat_fusion_method_transducer: film-weighted-avg + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file From 8c73fa4b388b4a8597b6002c89f3fdd4a5dcd543 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 28 Jun 2023 10:28:45 +0000 Subject: [PATCH 61/89] add run script for film-asr-lid --- .../v1/run_025_train_film_asr_lid.sh | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100755 egs/commonvoice/v1/run_025_train_film_asr_lid.sh diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh new file mode 100755 index 00000000..8b213cfe --- /dev/null +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1238 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer_languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-transducer $nnet_transducer \ + --in-model-lid $nnet_lid \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + 
--data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + From 982499e4de0b7de9cd8bc9c60f8acaebf77876b8 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Fri, 30 Jun 2023 06:49:38 -0400 Subject: [PATCH 62/89] update joint-training code --- ...ucer_ecapadnn512x3_1layer_stage1_v2.0.yaml | 139 ++++++++++++++++++ ...uned_filmed_transducer_lid_v2.0_13langs.sh | 43 ++++++ .../hf_wav2rnn_film_transducer_languageid.py | 35 ++++- .../narchs/rnn_film_transducer_decoder.py | 11 +- 4 files changed, 217 insertions(+), 11 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml new file mode 100644 index 00000000..0931c052 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml @@ -0,0 +1,139 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 9000 + hold_steps: 6000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh new file mode 100644 index 00000000..6fe79ec1 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index d967702a..8e29bc84 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -140,7 +140,8 @@ def _fuse_transducer_hid_feats(self, hid_feats, lang): # There is only one layer of features return hid_feats[0] - lang_condition = self.transducer.decoder.lang_embedding(lang) + if self.transducer.decoder.film_cond_type in ["one-hot", "lid_pred"]: + lang_condition = self.transducer.decoder.lang_embedding(lang) hid_feats = hid_feats[self.feat_fusion_start_transducer:] if self.feat_fusion_method_transducer == "film-weighted-avg": film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) @@ -230,7 +231,7 @@ def forward( languageid=None, return_feat_layers=None, return_enc_layers=None, - return_classif_layers=None, + return_classif_layers=[0], return_logits=True, ): """Forward function. If returns the logits posteriors of the classes. @@ -261,7 +262,7 @@ def forward( feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] - lid_logits = self.languageid( + output = self.languageid( feats_languageid, None, languageid, @@ -269,17 +270,21 @@ def forward( return_classif_layers=return_classif_layers, return_logits=return_logits, ) + # output["h_classif"] = h_classif + # output["logits"] = y_pred - loss_lid = self.loss_lid(lid_logits, languageid) + #loss_lid = self.loss_lid(lid_logits, languageid) + loss_lid = self.loss_lid(output["logits"], languageid) - - feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) + # feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"]) # (N, T, C) trans_output = self.transducer( feats_transducer, feat_lengths, text, - lid_logits + output["h_classif"] + # lid_logits ) if return_feat_layers: @@ -293,7 +298,8 @@ def forward( loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, h_feats=trans_output.h_feats, - logits=lid_logits if return_logits else None) + logits=output["logits"] if return_logits else None) + # logits=lid_logits if return_logits else None) return output def infer(self, @@ -341,6 +347,12 @@ def infer(self, return text, lid + def unfreeze_film(self): + for name, param in self.named_parameters(): + if "film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + # def freeze_feat_fuser(self): # if self.feat_fuser is None: # return @@ -366,6 +378,9 @@ def set_train_mode(self, mode): self.unfreeze() elif mode == "frozen": self.freeze() + elif mode in ["ft-film", "ft-film-grad"]: + self.freeze() + self.unfreeze_film() elif mode in 
["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -394,8 +409,10 @@ def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode in [ + "ft-film", "ft-transducer", "hf-feats-frozen", + "ft-film-grad", "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", @@ -410,8 +427,10 @@ def valid_train_modes(): return [ "full", "frozen", + "ft-film", "ft-embed-affine", "ft-transducer", + "ft-film-grad", "hf-feats-frozen", "ft-transducer-nograd", "hf-feats-frozen-nograd", diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 6a5c0845..f2cfad35 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -109,6 +109,9 @@ def __init__( self.lang_embedding = nn.Embedding(langs_size, condition_size) elif self.film_cond_type == "lid_pred": self.lang_embedding = nn.Linear(langs_size, condition_size) + elif self.film_cond_type == "lid_pred_embed": + # self.lang_embedding = nn.Linear(langs_size, condition_size) + pass if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) self.simple_lm_proj = nn.Linear(self.predictor.out_feats, @@ -309,7 +312,8 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: # embed lang - lang_embedding = self.lang_embedding(lang) + if self.film_cond_type in ["one-hot", "lid_pred"]: + lang_embedding = self.lang_embedding(lang) # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -342,7 +346,8 @@ def decode(self, max_sym_per_utt: int = 1000, ) -> List[int]: # embed lang - lang_embedding = self.lang_embedding(lang) + if self.film_cond_type in ["one-hot", "lid_pred"]: + lang_embedding = self.lang_embedding(lang) if method == "time_sync_beam_search": return self.decode_time_sync_beam_search(x, lang_embedding, @@ -833,7 +838,7 @@ def add_class_args(parser, parser.add_argument("--film-cond-type", default="one-hot", - choices=["one-hot", "lid_pred"], + choices=["one-hot", "lid_pred", "lid_pred_embed"], help=("type of the condition of FiLM layer")) parser.add_argument("--film-type", From 63a2bd994c961b6c438bda454cc66a8695d1b797 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 30 Jun 2023 09:38:03 -0400 Subject: [PATCH 63/89] added config 2.0 to vox v2 --- ...un_031_attack_type_verif_and_noveltydet.sh | 2 +- egs/voxceleb/v1.1/README.md | 18 +- ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 2 +- ...train_res2net50w26s8_xvec_stage1_v3.0.yaml | 2 +- ...train_res2net50w26s8_xvec_stage2_v3.0.yaml | 3 +- egs/voxceleb/v1.1/run_030_extract_xvectors.sh | 4 +- egs/voxceleb/v1.1/run_040_eval_be.sh | 2 +- egs/voxceleb/v1.2/run_001_prepare_data.sh | 34 +- egs/voxceleb/v2/README.md | 149 +----- ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 63 +++ ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++ .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml | 45 ++ .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml | 44 ++ ...onfig_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | 54 ++ egs/voxceleb/v2/run_030_extract_xvectors.sh | 16 +- egs/voxceleb/v2/run_040_eval_be.sh | 294 +++++++++- hyperion/bin/adv_finetune_xvector_from_wav.py | 5 +- hyperion/bin/apply_mvn_select_frames.py | 9 +- hyperion/bin/audio_to_duration.py | 5 +- 
hyperion/bin/compute_energy_vad.py | 9 +- hyperion/bin/compute_mfcc_feats.py | 9 +- hyperion/bin/copy_feats.py | 1 - hyperion/bin/decode_wav2transducer.py | 12 +- hyperion/bin/decode_wav2vec2rnn_transducer.py | 5 +- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 9 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 10 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 18 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 9 +- ...sine_scoring_from_transfer_adv_test_wav.py | 5 +- ...sine_scoring_from_transfer_art_test_wav.py | 18 +- hyperion/bin/eval_xvec_logits_from_wav.py | 9 +- hyperion/bin/extract_wav2vec2xvectors.py | 38 +- hyperion/bin/extract_xvectors_from_feats.py | 5 +- hyperion/bin/extract_xvectors_from_wav.py | 9 +- .../extract_xvectors_slidwin_from_feats.py | 11 +- .../bin/extract_xvectors_slidwin_from_wav.py | 11 +- hyperion/bin/finetune_wav2vec2transducer.py | 5 +- hyperion/bin/finetune_wav2vec2xvector.py | 18 +- .../bin/finetune_xvector_dfr_from_feats.py | 5 +- hyperion/bin/finetune_xvector_dfr_from_wav.py | 5 +- hyperion/bin/finetune_xvector_from_feats.py | 5 +- hyperion/bin/finetune_xvector_from_wav.py | 9 +- .../generate_adv_attacks_xvector_classif.py | 11 +- .../bin/generate_adv_attacks_xvector_verif.py | 11 +- hyperion/bin/hyperion_dataset.py | 23 +- hyperion/bin/hyperion_tables.py | 22 +- hyperion/bin/make_babble_noise_audio_files.py | 7 +- hyperion/bin/pack_wav_rirs.py | 9 +- hyperion/bin/plot_embedding_tsne.py | 5 +- hyperion/bin/plot_embedding_tsne_per_class.py | 5 +- hyperion/bin/prepare_data.py | 5 +- hyperion/bin/preprocess_audio_files.py | 7 +- .../split_dataset_into_trials_and_cohort.py | 68 +++ hyperion/bin/train_wav2rnn_transducer.py | 5 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 5 +- hyperion/bin/train_wav2vec2transducer.py | 5 +- hyperion/bin/train_wav2vec2xvector.py | 5 +- hyperion/bin/train_xvector_from_feats.py | 5 +- hyperion/bin/train_xvector_from_wav.py | 9 +- hyperion/data_prep/__init__.py | 1 + hyperion/data_prep/voxceleb1.py | 2 +- hyperion/data_prep/voxceleb2.py | 2 +- hyperion/data_prep/voxsrc22.py | 21 +- .../data/class_weighted_seg_chunk_sampler.py | 2 +- .../models/wav2xvectors/hf_wav2xvector.py | 100 ++-- hyperion/torch/torch_model.py | 34 +- hyperion/torch/tpm/hf/hf_hubert.py | 32 ++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 6 + hyperion/torch/tpm/hf/hf_wav2vec_base.py | 84 ++- hyperion/torch/tpm/hf/hf_wavlm.py | 32 ++ hyperion/torch/trainers/torch_trainer.py | 12 +- hyperion/utils/dataset.py | 500 ++++++++++++++---- hyperion/utils/segment_set.py | 10 +- 74 files changed, 1535 insertions(+), 628 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh create mode 100755 hyperion/bin/split_dataset_into_trials_and_cohort.py diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 4ce703ba..3b93fabd 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -293,7 +293,7 @@ if [ $stage -le 
13 ]; then awk '!/benign/' $list_someknown_dir/train/utt2spk > $list_someknown_dir/train_nobenign/utt2spk steps_backend/train_be_v1.sh --cmd "$train_cmd" \ --plda-type splda \ - --y-dim 6 \ + --y-dim 5 \ $sign_dir/train/xvector.scp \ $list_someknown_dir/train_nobenign \ $be_dir diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 73b9bb4e..3b9eeaa9 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -111,8 +111,11 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | | | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | -| | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | ### VoxCeleb 1 Entire-Clean trial list @@ -143,8 +146,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | - - +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -174,7 +178,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | - +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | ### VoxSRC2022 dev @@ -205,6 +211,10 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | + ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml index 9e302200..1016087d 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -68,5 +68,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 30 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml index 40fb362e..e98d6c13 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -68,5 +68,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 30 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml index 469e166b..5c9af011 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -44,7 +44,8 @@ model: margin_warmup_epochs: 0 intertop_margin: 0.1 override_dropouts: true - dropout_rate: 0.0 + # dropout_rate: 0.0 + dropout_rate: 0.2 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index 8c0949f4..f933a7b2 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=1 +nnet_stage=2 config_file=default_config.sh use_gpu=false xvec_chunk_length=12800 @@ -85,4 +85,4 @@ if [ $stage -le 2 ]; then done fi -exit + diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 0780584c..6bdbdf92 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=1 +nnet_stage=2 config_file=default_config.sh diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index f956bc8c..c151e270 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -16,26 +16,31 @@ config_file=default_config.sh if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
- hyp_utils/conda_env.sh \ - prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train + prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - #hyp_utils/conda_env.sh \ - prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ - --use-kaldi-ids \ - --output-dir data/voxceleb1_test + # hyp_utils/conda_env.sh + prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi -exit + if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - local/prepare_voxsrc22_dev.py \ - --vox1-corpus-dir $voxceleb1_root \ - --voxsrc22-corpus-dir $voxsrc22_root \ - --output-dir data/voxsrc22_dev + prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev + # local/prepare_voxsrc22_dev.py \ + # --vox1-corpus-dir $voxceleb1_root \ + # --voxsrc22-corpus-dir $voxsrc22_root \ + # --output-dir data/voxsrc22_dev + prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_test fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then @@ -46,5 +51,6 @@ fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # # split vox2 into 2 parts, for cohort and qmf training - local/make_vox2_trials.py --data-dir data/voxceleb2cat_train + split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train + #local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index 5b5b93e5..c64a4b41 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -1,24 +1,9 @@ -# VoxCeleb V1.1 +# VoxCeleb V2 -Recipe for the VoxCeleb Speaker Verification Task +Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors ## Differences w.r.t VoxCeleb V1 recipe -In recipe version V1: - - We compute speech augmentations and acoustic features offline and dump them to disk. - - Augmentation is performed using Kaldi scripts and wav-reverbate tool - - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. - -In this recipe: - - We compute speech augmentations and acoustic features are computed always on-the-fly, - we don't dump any features to disk. - - Augmentation is performed using Hyperin SpeechAugment class. - - The behavior of this class is controlled - by the the configuration file `conf/reverb_noise_aug.yml`, - which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - - Babble noise is created offline by mixing 3-10 single speaker files. 
- - ## Citing ## Training Data @@ -41,15 +26,14 @@ In this recipe: ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use Light ResNet (16 base channels) - - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as + - By default it will use + - For better performance use ```bash run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` ## Recipe Steps: @@ -73,7 +57,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates training and validation lists for x-vector training - `run_011_train_xvector.sh` - - Trains the x-vector network + - Trains the x-vector model on frozen wav2vec features + - Finetunes wav2vec+x-vector model + - Large margin finetuning of wav2vec+x-vector model - `run_030_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training @@ -89,117 +75,30 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | +| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | +| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | +| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | +| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | +| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | +| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 
100644 index 00000000..eed0ad1f --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..d66d6877 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + 
resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index 67122f85..16f29841 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -7,10 +7,10 @@ . 
./path.sh set -e -stage=2 +stage=1 +nnet_stage=3 config_file=default_config.sh use_gpu=false -nnet_stage=3 hf_chunk_length=120 #seconds xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; @@ -36,20 +36,20 @@ fi xvector_dir=exp/xvectors/$nnet_name -if [ $stage -le 1 ]; then +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then # Extract xvectors for training LDA/PLDA for name in voxceleb2cat_train do if [ $plda_num_augs -eq 0 ]; then steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ $nnet data/${name} \ $xvector_dir/${name} else steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ --aug-config $plda_aug_config --num-augs $plda_num_augs \ $nnet data/${name} \ $xvector_dir/${name}_augx${plda_num_augs} \ @@ -60,7 +60,10 @@ fi if [ $stage -le 2 ]; then # Extracts x-vectors for evaluation - for name in voxceleb1_test + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data do num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') nj=$(($num_spk < 100 ? $num_spk:100)) @@ -71,4 +74,3 @@ if [ $stage -le 2 ]; then done fi -exit diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh index ac561344..0982abeb 100755 --- a/egs/voxceleb/v2/run_040_eval_be.sh +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -7,10 +7,10 @@ . ./path.sh set -e -# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring -stage=3 -config_file=default_config.sh +stage=1 nnet_stage=3 +config_file=default_config.sh + . parse_options.sh || exit 1; . 
$config_file @@ -25,6 +25,15 @@ elif [ $nnet_stage -eq 2 ];then elif [ $nnet_stage -eq 3 ];then nnet=$nnet_s3 nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name fi plda_label=${plda_type}y${plda_y_dim}_v1 @@ -35,8 +44,12 @@ be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf -if [ $stage -le 1 ]; then + +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then echo "Train PLDA on Voxceleb2" steps_be/train_be_v1.sh \ --cmd "$train_cmd" \ @@ -45,14 +58,12 @@ if [ $stage -le 1 ]; then --y_dim $plda_y_dim --z_dim $plda_z_dim \ $xvector_dir/$plda_data/xvector.scp \ data/$plda_data \ - $be_dir & - - wait -fi - - -if [ $stage -le 2 ];then - + $be_dir + + fi + + + if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" steps_be/eval_be_v1.sh \ --cmd "$train_cmd" --plda_type $plda_type \ @@ -62,7 +73,7 @@ if [ $stage -le 2 ];then $be_dir/lda_lnorm.h5 \ $be_dir/plda.h5 \ $score_plda_dir/voxceleb1_scores - + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir @@ -72,32 +83,267 @@ if [ $stage -le 2 ];then cat $f echo "" done - + fi fi -score_plda_dir=$score_cosine_dir + if [ $stage -le 3 ];then - echo "Eval Voxceleb 1 with Cosine scoring" + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc2 with Cosine scoring" steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores - for f in $(ls $score_plda_dir/*_results); + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + +fi + + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + 
$xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + fi +fi + + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb2cat_train/utt2speech_dur \ + > $xvector_dir/voxceleb2cat_train/utt2num_frames + + echo "Train QMF in Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores + + fi + + if [ $stage -le 8 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb1_test/utt2speech_dur \ + > $xvector_dir/voxceleb1_test/utt2num_frames + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); + do + echo $f + cat $f + echo "" + done + + fi + + if [ $stage -le 9 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxsrc22_dev/utt2speech_dur \ + > $xvector_dir/voxsrc22_dev/utt2num_frames + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + 
data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxsrc22_dev_scores & + + # awk '{ print $1, $2*100}' \ + # $xvector_dir/voxsrc22_test/utt2speech_dur \ + # > $xvector_dir/voxsrc22_test/utt2num_frames + # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $xvector_dir/voxsrc22_test/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); do echo $f cat $f echo "" done + fi + +fi + +if [ "$do_pca" != "true" ];then + exit 0 +fi + + +be_name=pca_r${pca_var_r} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf + +be_dir=exp/be/$nnet_name/ +score_be_dir=$score_dir/pca_r${pca_var_r} + +if [ $stage -le 10 ]; then + echo "Train projection on Voxceleb2" + $train_cmd $be_dir/log/train_be.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_proj_v1.py \ + --v-file scp:$xvector_dir/$plda_data/xvector.scp \ + --train-list data/$plda_data/utt2spk \ + --output-path $be_dir \ + --pca.pca-var-r $pca_var_r fi -exit +if [ $stage -le 11 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + --preproc-file $be_dir/preproc.h5 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index 7be882e0..f45b84a0 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -32,6 +29,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { 
"resnet": RXVec, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index f5a3ce15..bdf53786 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,13 +10,6 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -25,6 +18,8 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def process_feats( diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index 38e8dff2..ac8852a4 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,12 +9,11 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 058f982a..e9773fff 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,17 +9,12 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_vad(input_path, output_path, write_num_frames, **kwargs): diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index ca6e26f7..442e4141 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,19 +9,14 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import compression_methods from hyperion.np.feats import MFCC +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_mfcc_feats( diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 0385cc55..4549caec 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -12,7 +12,6 @@ import time import numpy as np - from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index c7de38f1..972b247c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,13 +13,6 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import ( 
- ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -27,10 +20,13 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search +from hyperion.torch.models.wav2transducer.beam_search import (beam_search, + greedy_search) from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 8ef8d414..4fdc3140 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -13,9 +13,6 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -29,6 +26,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 50fd5088..7c9d4104 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,13 +10,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -33,6 +26,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index 5697404d..fb0d402c 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,19 +7,11 @@ import os import sys import time - # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -37,6 +29,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) torch.backends.cudnn.enabled = False diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 0ca1f740..2d5baa17 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -11,17 +11,10 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from art.estimators.classification import PyTorchClassifier -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,15 +22,16 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import ( - ARTAttackFactory as AttackFactory, -) +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 49a762af..76af5d75 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,13 +10,6 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -31,6 +24,8 @@ from hyperion.torch.utils.misc import l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index b2c111ca..f33402a1 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -10,9 +10,6 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -29,6 +26,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 8b6c8dae..f94dc497 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -11,17 +11,10 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from art.estimators.classification import PyTorchClassifier -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,15 +22,16 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import ( - ARTAttackFactory as AttackFactory, -) +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 98ba76b5..9efbd6dd 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,13 +11,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -28,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index c4c4676f..6f7d269e 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -11,15 +11,8 @@ import numpy as np import pandas as pd -import torchaudio.transforms as tat -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import torchaudio.transforms as tat from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -28,26 +21,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info - -resamplers = {} - - -def get_resampler(source_fs, target_fs): - if source_fs in resamplers: - return resamplers[source_fs] - - resampler = tat.Resample( - int(source_fs), - int(target_fs), - 
lowpass_filter_width=64, - rolloff=0.9475937167399596, - resampling_method="kaiser_window", - beta=14.769656459379492, - ) - resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() - resamplers[source_fs] = resampler_f - return resampler_f - +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) resamplers = {} @@ -168,7 +143,10 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -240,7 +218,7 @@ def extract_xvectors( writer.write([key], [y]) if write_speech_dur is not None: keys.append(key) - info.append(str(x.shape[1] * fs)) + info.append(str(x.shape[1] / fs)) t8 = time.time() read_time = t2 - t1 diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 926e0bcc..13ad4277 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -22,6 +19,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index f49a5fb0..577bbae7 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,13 +11,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -28,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index eaf0a5cc..a54c4d64 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -10,15 +10,8 @@ import time import numpy as np -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -27,6 +20,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 9dc0aa2c..8939ba91 100755 --- 
a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -11,15 +11,8 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -29,6 +22,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index df267e72..6f17f800 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index b3edd9b5..fc3c7084 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -26,6 +23,8 @@ HFWavLM2ResNet1dXVector) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -79,7 +78,12 @@ def init_model(num_classes, in_model_file, rank, **kwargs): def init_hard_prototype_mining(model, train_loader, val_loader, rank): - if not train_loader.batch_sampler.hard_prototype_mining: + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: return if rank == 0: @@ -118,7 +122,11 @@ def train_model(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 2ac01025..17cafb85 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -12,9 +12,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import 
torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index ff97d3ca..f7832a47 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data( diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index 7a1fb5a9..ac9c2d0b 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML @@ -23,6 +20,8 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 7d602709..1c7cbe58 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,13 +10,6 @@ import time from pathlib import Path -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML @@ -32,6 +25,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 8c6f38a6..209915c5 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -11,16 +11,9 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as 
AW from hyperion.io import RandomAccessAudioReader as AR @@ -31,6 +24,8 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index fbd3a5fb..363e3afc 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -11,16 +11,9 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -35,6 +28,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 9e7bac5c..c5a3f6b9 100644 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -4,27 +4,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Optional, Union, List from pathlib import Path - -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import ( - PathLike, - Dataset, - InfoTable, - RecordingSet, - FeatureSet, - ClassInfo, - EnrollmentMap, - SegmentSet, -) +from hyperion.utils import (ClassInfo, Dataset, EnrollmentMap, FeatureSet, + InfoTable, PathLike, RecordingSet, SegmentSet) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) subcommands = ["add_features"] # table_dict = { diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index a79a1dca..5a5f0b4f 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -4,26 +4,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Optional, Union, List from pathlib import Path - -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import ( - PathLike, - InfoTable, - RecordingSet, - FeatureSet, - ClassInfo, - EnrollmentMap, - SegmentSet, -) +from hyperion.utils import (ClassInfo, EnrollmentMap, FeatureSet, InfoTable, + PathLike, RecordingSet, SegmentSet) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) subcommands = ["cat"] table_dict = { diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 972ff01f..4a356037 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,15 +10,14 @@ import time import numpy as 
np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal - from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal def make_noise(xs): diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 4aafa075..78ac59c1 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,16 +10,11 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def pack_wav_rirs(input_path, output_spec, **kwargs): diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e011dfe8..e2157e3e 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,13 +13,12 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6f35f074..6af0202c 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,15 +13,14 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet from hyperion.utils.math import cosine_scoring +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index 4105f482..e90ad0f7 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,11 +6,10 @@ import logging from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def make_parser(data_prep_class): diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 2f4e5cbc..e8adfd16 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ 
b/hyperion/bin/preprocess_audio_files.py @@ -10,15 +10,14 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal - from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal def process_vad(vad, length, fs, dilation, erosion): diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py new file mode 100755 index 00000000..24ec10bf --- /dev/null +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + """Split speakers in dataset into test speaker to create ASV trials and + cohort speakers for S-Norm""" + ) + ) + + parser.add_argument("--data-dir", required=True, help="Path to dataset") + parser.add_argument( + "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials" + ) + parser.add_argument( + "--num-trial-speakers", + type=int, + default=1000, + help="number of speakers to create trials", + ) + parser.add_argument( + "--intra-gender", + default=True, + action=ActionYesNo, + help="Whether we create intra gender trials or not", + ) + parser.add_argument("--seed", type=int, default=1123, help="random seed") + parser.add_argument( + "--trials-dir", default=None, help="Path to output trials dataset" + ) + parser.add_argument( + "--cohort-dir", default=None, help="Path to output cohort dataset" + ) + + args = parser.parse_args() + config_logger(1) + data_dir = args.data_dir + cohort_dir = args.cohort_dir + cohort_dir = f"{data_dir}_cohort" if cohort_dir is None else cohort_dir + trials_dir = args.trials_dir + trials_dir = f"{data_dir}_trials" if trials_dir is None else trials_dir + + del args.data_dir + del args.cohort_dir + del args.trials_dir + args = namespace_to_dict(args) + + dataset = Dataset.load(data_dir) + trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) + trials_dataset.save(trials_dir) + cohort_dataset.save(cohort_dir) diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 26fcf72c..8930b299 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -23,6 +20,8 @@ from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence 
model_dict = { diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 5daffb6d..7018c406 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ HFWav2Vec2RNNTransducer) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ce53be86..55f3b996 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 5e7ecafa..8e1653b1 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ HFWavLM2ResNet1dXVector) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 7f4ab0fa..71bba080 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -28,6 +25,8 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index a210d429..b2e36cac 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -8,13 +8,6 @@ import os from pathlib import Path -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs 
import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD @@ -29,6 +22,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index 9ae59246..e978e219 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -6,3 +6,4 @@ from .data_prep import DataPrep from .voxceleb2 import VoxCeleb2DataPrep from .voxceleb1 import VoxCeleb1DataPrep +from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index c23b64ff..b3958605 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -330,7 +330,7 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - classes={"speaker": speakers, "languages": languages}, + classes={"speaker": speakers, "language_est": languages}, recordings={"recordings": recs}, enrollments=enrollments, trials=trials, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index bef34ec9..29ad3e44 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -251,7 +251,7 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - {"speaker": speakers, "languages": languages}, + {"speaker": speakers, "language_est": languages}, {"recordings": recs}, ) logging.info("saving dataset at %s", self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py index 1999262a..79369149 100644 --- a/hyperion/data_prep/voxsrc22.py +++ b/hyperion/data_prep/voxsrc22.py @@ -53,7 +53,7 @@ def __init__( @staticmethod def dataset_name(): - return "voxceleb2" + return "voxsrc22" @staticmethod def add_class_args(parser): @@ -117,11 +117,13 @@ def prepare_track12_dev(self): vox1_segmentid.append(s) vox1_rec_files = [ - glob.glob(f"{self.vox1_corpus_dir}/**/{s}") for s in vox1_segmentid - ] - vox22_rec_files = [ - glob.glob(f"{self.corpus_dir}/**/{s}") for s in vox22_segmentid + glob.glob(f"{self.vox1_corpus_dir}/**/{s}")[0] for s in vox1_segmentid ] + # vox22_rec_files = [ + # glob.glob(f"{self.corpus_dir}/**/{s}")[0] for s in vox22_segmentid + # ] + vox22_rec_files = [f"{self.corpus_dir}/{s}" for s in vox22_segmentid] + rec_ids = vox22_segmentid + vox1_segmentid rec_files = vox22_rec_files + vox1_rec_files @@ -135,7 +137,11 @@ def prepare_track12_dev(self): recs["target_sample_freq"] = self.target_sample_freq logging.info("making SegmentsSet") - segments = pd.DataFrame({"id": rec_ids,}) + segments = pd.DataFrame( + { + "id": rec_ids, + } + ) segments = SegmentSet(segments) segments.sort() @@ -150,7 +156,8 @@ def prepare_track12_dev(self): logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) logging.info( - "datasets containts %d segments", len(segments), + "datasets containts %d segments", + len(segments), ) # wav_file = voxsrc22_corpus_dir / file_id diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 7fbfbd71..6ee00307 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -205,7 +205,7 @@ def _set_class_weights(self): 
self.class_info.set_uniform_weights() elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values - self.class_info.set_weights(self, weights) + self.class_info.set_weights(weights) if self.weight_exponent != 1.0: self.class_info.exp_weights(self.weight_exponent) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 5599fa1e..c2bcdf99 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -26,11 +26,9 @@ class HFWav2XVector(TorchModel): than one layer is used. """ - def __init__(self, - hf_feats, - xvector, - feat_fusion_start=0, - feat_fusion_method="weighted-avg"): + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): super().__init__() self.hf_feats = hf_feats @@ -51,12 +49,9 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -71,7 +66,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start :] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -125,14 +120,14 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats(self, - x, - x_lengths, - return_feat_layers=None, - chunk_length=0, - detach_chunks=False): - return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) with self._hf_context: hf_output = self.hf_feats( x, @@ -154,7 +149,8 @@ def forward_feats(self, # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
hid_feats = [ - f.transpose(1, 2) for i, f in enumerate(hid_feats) + f.transpose(1, 2) + for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -194,7 +190,8 @@ def forward( "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, return_feat_layers + ) output = self.xvector( feats, feat_lengths, @@ -230,16 +227,17 @@ def extract_embed( x, x_lengths = remove_silence(x, x_lengths) feats, _, feat_lengths = self.forward_feats( - x, - x_lengths, - chunk_length=hf_chunk_length, - detach_chunks=detach_chunks) - xvec_chunk_length = int(xvec_chunk_length * - self.hf_feats.sample_frequency * - feats.size(-1) // x.size(-1)) - return self.xvector.extract_embed(feats, feat_lengths, - xvec_chunk_length, embed_layer, - detach_chunks) + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.xvector.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) def freeze_feat_fuser(self): if self.feat_fuser is None: @@ -258,6 +256,23 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def has_param_groups(self): + return self.hf_feats.has_param_groups() + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + param_groups = self.hf_feats.trainable_param_groups() + if self.feat_fusion_method == "weighted-avg": + if self.feat_fuser.requires_grad: + param_groups.append({"params": self.feat_fuser}) + else: + param_groups.append({"params": self.feat_fuser.parameters()}) + + param_groups.append({"params": self.xvector.trainable_parameters()}) + return param_groups + def set_train_mode(self, mode): if mode == self._train_mode: return @@ -302,11 +317,11 @@ def _train(self, train_mode: str): self.hf_feats.train() self.xvector._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", - "hf-feats-frozen", - "ft-xvector-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", ]: self.hf_feats.train() self.xvector._train("full") @@ -369,16 +384,19 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-start", default=0, type=int, - help= - ("the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers"), + help=( + "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), ) parser.add_argument( "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=("method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]"), + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), ) if prefix is not None: diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 65e5884d..0cb887ca 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -13,8 +13,8 @@ class TorchModel(nn.Module): - """Base class for all Pytorch Models and NNet architectures - """ + """Base class for all Pytorch Models and NNet architectures""" + registry = {} def __init_subclass__(cls, **kwargs): @@ -45,6 
+45,12 @@ def non_trainable_parameters(self, recurse: bool = True): if not param.requires_grad: yield param + def has_param_groups(self): + return False + + def trainable_param_groups(self): + return self.trainable_parameters() + def freeze(self): for param in self.parameters(): param.requires_grad = False @@ -109,10 +115,9 @@ def save(self, file_path): os.makedirs(file_dir, exist_ok=True) config = self.get_config() - torch.save({ - "model_cfg": self.get_config(), - "model_state_dict": self.state_dict() - }) + torch.save( + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()} + ) @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @@ -132,8 +137,7 @@ def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = TorchModel._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) if state_dict is not None: @@ -148,14 +152,15 @@ def get_loss(self): @property def device(self): - devices = {param.device - for param in self.parameters() - } | {buf.device - for buf in self.buffers()} + devices = {param.device for param in self.parameters()} | { + buf.device for buf in self.buffers() + } if len(devices) != 1: raise RuntimeError( "Cannot determine device: {} different devices found".format( - len(devices))) + len(devices) + ) + ) return next(iter(devices)) @@ -217,5 +222,4 @@ def auto_load(file_path, extra_objs={}, map_location=None): # if it failed the 3 trials raise exception raise err # remove module prefix when is trained with dataparallel - state_dict = ODict( - (p.sub("", k), v) for k, v in state_dict.items()) + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index b2198924..2957e433 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -135,6 +135,8 @@ class HFHubert(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. 
""" def __init__( @@ -182,6 +184,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -199,6 +203,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -287,6 +293,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.hubert.modeling_hubert as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.HubertAttention): + module.dropout = activation_dropout + if isinstance(module, t.HubertFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index e1f21153..26da7beb 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -148,6 +148,8 @@ class HFWav2Vec2(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -200,6 +202,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -217,6 +221,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index b0a815c7..a9c4ddef 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -53,6 +53,8 @@ class HFWav2VecBase(TorchModel): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. 
+ encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -71,6 +73,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -84,6 +88,8 @@ def __init__( self.override_spec_augment = override_spec_augment self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -215,7 +221,14 @@ def out_shape(self, in_shape): C = self.hf_model.config.hidden_size return (in_shape[0], out_length, C) - def change_config(self, override_dropouts, override_spec_augment, **kwargs): + def change_config( + self, + override_dropouts: bool, + override_spec_augment: bool, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + **kwargs, + ): if override_spec_augment: logging.info("overriding speech augment") self.change_spec_augment(**kwargs) @@ -224,6 +237,9 @@ def change_config(self, override_dropouts, override_spec_augment, **kwargs): logging.info("overriding hf model dropouts") self.change_dropouts(**kwargs) + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + def change_spec_augment( self, apply_spec_augment: bool = True, @@ -249,6 +265,35 @@ def change_dropouts(self, **kwargs): def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def has_param_groups(self): + return self.feat_extract_lr is not None or self.encoder_lr is not None + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + if self.feat_extract_lr == self.encoder_lr: + return [{"params": self.trainable_parameters(), "lr": self.encoder_lr}] + + param_groups = [ + {"params": self.hf_model.feature_extractor.parameters()}, + {"params": self.hf_model.feature_projection.parameters()}, + {"params": self.hf_model.encoder.parameters()}, + ] + if self.hf_model.adapter is not None: + param_groups.append({"params": self.hf_model.adapter.parameters()}) + + if self.feat_extract_lr is not None: + param_groups[0]["lr"] = self.feat_extract_lr + param_groups[1]["lr"] = self.feat_extract_lr + + if self.encoder_lr is not None: + param_groups[2]["lr"] = self.encoder_lr + if len(param_groups) == 4: + param_groups[3]["lr"] = self.encoder_lr + + return param_groups + @property def hf_config(self): return self.hf_model.config @@ -570,7 +615,6 @@ def add_class_args(parser, prefix=None, skip=set()): help=("file path or HuggingFace Hub path to pre-trained model"), ) - parser.add_argument( "--normalize-input", default=True, @@ -659,6 +703,24 @@ def add_class_args(parser, prefix=None, skip=set()): "when the signal is evaluated chunk by chunk." ), ) + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @@ -696,6 +758,24 @@ def add_finetune_args(parser, prefix=None, skip=set()): "arguments instead of the defaults in the pretrained model." ), ) + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." + ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 0d5c5ad3..e1b67d81 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -148,6 +148,8 @@ class HFWavLM(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -200,6 +202,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -217,6 +221,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -310,6 +316,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.wavlm.modeling_wavlm as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.WavLMAttention): + module.dropout = activation_dropout + if isinstance(module, t.WavLMFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index c8565d1d..5e41747c 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -163,7 +163,9 @@ def __init__( oss = False if ddp_type == DDPType.DDP else True self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( - self.model, device_ids=[device], output_device=device, + self.model, + device_ids=[device], + output_device=device, ) elif 
ddp_type == DDPType.OSS_SHARDED_DDP: self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) @@ -424,7 +426,9 @@ def _make_optimizer(self, optim, model, oss=False): opt_args["oss"] = oss if self.rank == 0: logging.info("optimizer args={}".format(opt_args)) - optimizer = OF.create(model.parameters(), **opt_args) + + # optimizer = OF.create(model.parameters(), **opt_args) + optimizer = OF.create(model.trainable_param_groups(), **opt_args) return optimizer def _make_lr_sched(self, lr_sched, optim): @@ -458,8 +462,8 @@ def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): def _get_lr(self): """Returns the current learning rate to show in the loggers""" - for param_group in self.optimizer.param_groups: - return param_group["lr"] + lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] + return max(lrs) def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index 0ef81ab6..d1d969fb 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -2,10 +2,13 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import logging from pathlib import Path from typing import Dict, Optional, Union - +from copy import deepcopy +import math +import numpy as np +import pandas as pd import yaml from .class_info import ClassInfo @@ -93,10 +96,13 @@ def _parse_dict_args(self, data, types): return objects, paths + def clone(self): + return deepcopy(self) + def segments(self, keep_loaded: bool = True): if self._segments is None: assert self._segments_path is not None - segments = SegmentSet.load(self.segments_path, sep=self.table_sep) + segments = SegmentSet.load(self._segments_path, sep=self.table_sep) if keep_loaded: self._segments = segments return segments @@ -111,6 +117,7 @@ def recordings_value(self, key: str, keep_loaded: bool = True): ) if keep_loaded: self._recordings[key] = recordings + return recordings return self._recordings[key] @@ -120,6 +127,7 @@ def features_value(self, key: str, keep_loaded: bool = True): features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) if keep_loaded: self._features[key] = features + return features return self._features[key] @@ -129,6 +137,7 @@ def classes_value(self, key: str, keep_loaded: bool = True): classes = ClassInfo.load(self._classes_paths[key], self.table_sep) if keep_loaded: self._classes[key] = classes + return classes return self._classes[key] @@ -140,6 +149,7 @@ def enrollments_value(self, key: str, keep_loaded: bool = True): ) if keep_loaded: self._enrollments[key] = enrollments + return enrollments return self._enrollments[key] @@ -156,6 +166,7 @@ def trials_value(self, key: str, keep_loaded: bool = True): if keep_loaded: self._trials[key] = trials + return trials return self._trials[key] @@ -194,6 +205,49 @@ def trials(self, keep_loaded: bool = True): for key in self._trials.keys(): yield key, self.trials_value(key, keep_loaded) + # def add_recordings(self, recordings: Dict[str, Union[RecordingSet, PathLike]]): + # recordings, recordings_paths = self._parse_dict_args(recordings, RecordingSet) + # if self._recordings is None: + # self._recordings = self._recordings_paths = {} + # self._recordings.update(recordings) + # self._recordings_paths.update(recordings_paths) + + # def add_features(self, features: Dict[str, Union[FeatureSet, PathLike]]): + # features, features_paths = self._parse_dict_args(features, 
FeatureSet) + # if self._features is None: + # self._features = self._features_paths = {} + # self._features.update(features) + # self._features_paths.update(features_paths) + + # def add_classes(self, classes: Dict[str, Union[ClassInfo, PathLike]]): + # classes, classes_paths = self._parse_dict_args(classes, ClassInfo) + # if self._classes is None: + # self._classes = self._classes_paths = {} + # self._classes.update(classes) + # self._classes_paths.update(classes_paths) + + # def add_enrollments(self, enrollments: Dict[str, Union[EnrollmentMap, PathLike]]): + # enrollments, enrollments_paths = self._parse_dict_args( + # enrollments, + # EnrollmentMap, + # ) + # if self._enrollments is None: + # self._enrollments = self._enrollments_paths = {} + # self._enrollments.update(enrollments) + # self._enrollments_paths.update(enrollments_paths) + + # def add_trials( + # self, trials: Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + # ): + # trials, trials_paths = self._parse_dict_args( + # trials, + # (TrialKey, TrialNdx, SparseTrialKey), + # ) + # if self._trials is None: + # self._trials = self._trials_paths = {} + # self._trials.update(trials) + # self._trials_paths.update(trials_paths) + @staticmethod def resolve_dataset_path(dataset_path): dataset_path = Path(dataset_path) @@ -209,6 +263,8 @@ def resolve_dataset_path(dataset_path): @staticmethod def resolve_file_path(dataset_dir, file_path): + dataset_dir = Path(dataset_dir) + file_path = Path(file_path) if file_path.is_file(): return file_path @@ -274,95 +330,100 @@ def save_changed( if update_paths: self._segments_path = file_path - file_names = {} - for k in self._recordings.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._recordings is not None - or file_path != self._recordings_paths[k] - or not file_path.exists() - ): - v = self.recordings_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path - - if file_names: - dataset["recordings"] = file_names - - file_names = {} - for k in self._features.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._features is not None - or file_path != self._features_paths[k] - or not file_path.exists() - ): - v = self.features_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._features_paths[k] = file_path - - if file_names: - dataset["features"] = file_names - - file_names = {} - for k, v in self._classes.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._classes is not None - or file_path != self._classes_paths[k] - or not file_path.exists() - ): - v = self.classes_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._classes_paths[k] = file_path - - if file_names: - dataset["classes"] = file_names - - file_names = {} - for k, v in self._enrollments.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._enrollments is not None - or file_path != self._enrollments_paths[k] - or not file_path.exists() - ): - v = self.enrollments_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._enrollments_paths[k] = file_path - - if file_names: - dataset["enrollments"] = file_names - - file_names = {} - for k, v in self._trials.keys(): - file_name = k + table_ext - 
file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._trials is not None - or file_path != self._trials_paths[k] - or not file_path.exists() - ): - v = self.trials_value(k, keep_loaded=False) - v.save(file_path) - if update_paths: - self._trials_paths[k] = file_path - - if file_names: - dataset["trials"] = file_names + if self._recordings is not None: + file_names = {} + for k in self._recordings.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings[k] is not None + or file_path != self._recordings_paths[k] + or not file_path.exists() + ): + v = self.recordings_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + + if file_names: + dataset["recordings"] = file_names + + if self._features is not None: + file_names = {} + for k in self._features.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._features[k] is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + if self._classes is not None: + file_names = {} + for k in self._classes.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._classes[k] is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + if self._enrollments is not None: + file_names = {} + for k in self._enrollments.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._enrollments[k] is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + if self._trials is not None: + file_names = {} + for k in self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials[k] is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names with open(dataset_file, "w") as f: yaml.dump(dataset, f) @@ -491,7 +552,7 @@ def load( """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) - with open(dataset_file, "w") as f: + with open(dataset_file, "r") as f: dataset = yaml.safe_load(f) assert "segments" in dataset @@ -503,27 +564,27 @@ def load( trials = None if "classes" in dataset: classes = {} - for k, v in dataset["classes"]: + for k, v in dataset["classes"].items(): classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: recordings = {} - for k, v in dataset["recordings"]: + for k, v in dataset["recordings"].items(): recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} - for k, v in dataset["features"]: + 
for k, v in dataset["features"].items(): features[k] = Dataset.resolve_file_path(dataset_dir, v) if "enrollments" in dataset: enrollments = {} - for k, v in dataset["enrollments"]: + for k, v in dataset["enrollments"].items(): enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) if "trials" in dataset: trials = {} - for k, v in dataset["trials"]: + for k, v in dataset["trials"].items(): trials[k] = Dataset.resolve_file_path(dataset_dir, v) dataset = cls( @@ -541,6 +602,10 @@ def load( return dataset def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if self._features is None: + self._features = {} + self._features_paths = {} + if isinstance(features, (str, Path)): self._features[features_name] = None self._features_paths[features_name] = features @@ -555,6 +620,10 @@ def add_recordings( recordings_name: str, recordings: Union[PathLike, RecordingSet], ): + if self._recordings is None: + self._recordings = {} + self._recordings_paths = {} + if isinstance(features, (str, Path)): self._recordings[features_name] = None self._recordings_paths[recordings_name] = recordings @@ -565,6 +634,10 @@ def add_recordings( raise ValueError() def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if self._classes is None: + self._classes = {} + self._classes_paths = {} + if isinstance(classes, (str, Path)): self._classes[features_name] = None self._classes_paths[classes_name] = classes @@ -579,8 +652,12 @@ def add_enrollments( enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], ): - if isinstance(features, (str, Path)): - self._enrollments[features_name] = None + if self._enrollments is None: + self._enrollments = {} + self._enrollments_paths = {} + + if isinstance(enrollments, (str, Path)): + self._enrollments[enrollments_name] = None self._enrollments_paths[enrollments_name] = enrollments elif isinstance(enrollments, EnrollmentMap): self._enrollments[enrollments_name] = enrollments @@ -593,7 +670,11 @@ def add_trials( trials_name: str, trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey], ): - if isinstance(features, (str, Path)): + if self._trials is None: + self._trials = {} + self._trials_paths = {} + + if isinstance(trials, (str, Path)): self._trials[features_name] = None self._trials_paths[trials_name] = trials elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): @@ -601,3 +682,220 @@ def add_trials( self._trials_paths[trials_name] = None else: raise ValueError() + + def remove_features(self, features_name: str): + if self._features_paths[features_name] is not None: + file_path = Path(self._features_paths[features_name]) + if file_path.is_file(): + file_path.unlink() + + del self._features[features_name] + del self._features_paths[features_name] + + def remove_recordings( + self, + recordings_name: str, + ): + if self._recordingsr_paths[recordings_name] is not None: + file_path = Path(self._recordings_paths[recordings_name]) + if file_path.is_file(): + file_path.unlink() + + del self._recordings[recordings_name] + del self._recordings_paths[recordings_name] + + def remove_classes(self, classes_name: str): + if self._classes_paths[classes_name] is not None: + file_path = Path(self._classes_paths[classes_name]) + if file_path.is_file(): + file_path.unlink() + + del self._classes[classes_name] + del self._classes_paths[classes_name] + + def remove_enrollments( + self, + enrollments_name: str, + ): + if self._enrollments_paths[enrollments_name] is not None: + file_path = 
Path(self._enrollments_paths[enrollments_name]) + if file_path.is_file(): + file_path.unlink() + + del self._enrollments[enrollments_name] + del self._enrollments_paths[enrollments_name] + + def remove_trials( + self, + trials_name: str, + ): + if self._trials_paths[trials_name] is not None: + file_path = Path(self._trials_paths[trials_name]) + if file_path.is_file(): + file_path.unlink() + + del self._trials[trials_name] + del self._trials_paths[trials_name] + + def set_segments(self, segments: Union[PathLike, SegmentSet]): + if isinstance(segments, SegmentSet): + self._segments = segments + else: + self._segments_path = segments + + def clean(self): + rec_ids = self.segments().recording_ids() + for k, table in self.recordings(): + table = table.loc[table["id"].isin(rec_ids)].copy() + self._recordings[k] = RecordingSet(table) + + ids = self.segments()["id"].values + for k, table in self.features(): + table = table.loc[table["id"].isin(ids)].copy() + self._features[k] = FeatureSet(table) + + for k, table in self.classes(): + class_ids = self.segments()[k].unique() + table = table[table["id"].isin(class_ids)].copy() + self._classes[k] = ClassInfo(table) + + remove_keys = [] + for k, table in self.enrollments(): + table = table.loc[table["segmentid"].isin(ids)].copy() + if len(table) > 0: + self._enrollments[k] = EnrollmentMap(table) + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_enrollments(k) + + remove_keys = [] + for k, key in self.trials(): + keep_ids = [cur_id for cur_id in key.seg_set if cur_id in ids] + if keep_ids: + key = key.filter(key.model_set, keep_ids, keep=True) + self._trials[k] = key + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_trials(k) + + def _split_into_trials_and_cohort( + self, + segments: SegmentSet, + num_tar_trials: int, + num_trial_speakers: int, + seed: int, + ): + # select test speakers + rng = np.random.RandomState(seed=seed) + + spks = segments["speaker"].unique() + trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) + snorm_segments = SegmentSet(segments[~segments["speaker"].isin(trial_spks)]) + + trial_segments = segments[segments["speaker"].isin(trial_spks)] + # solution of 2nd degree eq. + # num_spks * n (n-1) /2 = num_trials + num_segs_per_spk = int( + math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_trial_speakers)) / 2) + ) + + n = num_trial_speakers * num_segs_per_spk + seg_ids = rng.choice(trial_segments["id"], size=(n,), replace=False) + trial_segments = SegmentSet(segments[segments["id"].isin(seg_ids)]) + seg_ids = trial_segments["id"].values + class_ids = trial_segments["speaker"].values + tar = np.zeros((n - 1, n), dtype=bool) + non = np.zeros((n - 1, n), dtype=bool) + + ntar = 0 + nnon = 0 + for i in range(n - 1): + for j in range(i + 1, n): + if class_ids[i] == class_ids[j]: + tar[i, j] = True + else: + non[i, j] = True + + logging.info("Got ntar=%d and nnon=%d", tar.sum(), non.sum()) + trials = TrialKey(seg_ids[:-1], seg_ids, tar, non) + df_enr = pd.DataFrame({"id": seg_ids[:-1], "segmentid": seg_ids[:-1]}) + enrollments = EnrollmentMap(df_enr) + return trials, enrollments, snorm_segments + + def split_into_trials_and_cohort( + self, + num_1k_tar_trials: int, + num_trial_speakers: int, + intra_gender: bool = True, + trials_name="trials_qmf", + seed=1123, + ): + """When training quality measure fusion in, e.g., VoxCeleb recipe. 
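The per-speaker segment count in _split_into_trials_and_cohort above solves n(n-1)/2 >= num_tar_trials / num_trial_speakers for n. A quick worked example with illustrative numbers:

    # e.g. 15000 target trials spread over 500 trial speakers -> 30 target trials per speaker
    # n(n-1)/2 >= 30  =>  n >= (1 + sqrt(1 + 8 * 30)) / 2 = (1 + sqrt(241)) / 2 ~= 8.26
    # num_segs_per_spk = ceil(8.26) = 9, i.e. 9 * 8 / 2 = 36 >= 30 same-speaker pairs per speaker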
+ We split the data into 2 parts: + 1) used to calculate SV scores to train the fusion + 2) cohort used to calculate the S-Norm parameters used in the QMF. + + The trials_file will be stored in the current dataset + A new dataset is created with only the cohort speakers + + Args: + num_1k_tar_trials: num of 1000 target trials. + num_trial_speakers: number of spks used to create trials. + intra_gender: if True, no cross gender trials are done. + + Returns: + Dataset used for trials with trial list. + Dataset used for cohort. + """ + num_tar_trials = num_1k_tar_trials * 1000 + if intra_gender: + num_tar_trials = num_tar_trials // 2 + num_trial_speakers = num_trial_speakers // 2 + segments = self.segments() + segments_male = SegmentSet(segments[segments["gender"] == "m"]) + segments_female = SegmentSet(segments[segments["gender"] == "f"]) + trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( + segments_male, + num_tar_trials, + num_trial_speakers, + seed, + ) + ( + trials_female, + enroll_female, + cohort_female, + ) = self._split_into_trials_and_cohort( + segments_female, + num_tar_trials, + num_trial_speakers, + seed, + ) + trials = TrialKey.merge([trials_male, trials_female]) + enroll = EnrollmentMap.cat([enroll_male, enroll_female]) + cohort = SegmentSet.cat([cohort_male, cohort_female]) + else: + segments = self.segments() + trials, enroll, cohort = self._split_into_trials_and_cohort( + segments, + num_tar_trials, + num_trial_speakers, + seed, + ) + + dataset_trials = self.clone() + segments = self.segments() + trials_segments = SegmentSet(segments.loc[segments["id"].isin(trials.seg_set)]) + dataset_trials.set_segments(trials_segments) + dataset_trials.add_trials("trials", trials) + dataset_trials.add_enrollments("enrollments", enroll) + dataset_trials.clean() + + dataset_cohort = self.clone() + dataset_cohort.set_segments(cohort) + dataset_cohort.clean() + + return dataset_trials, dataset_cohort diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 1852d25d..6aef5bb2 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -8,7 +8,7 @@ class SegmentSet(InfoTable): """Class to store information about a speech segment - Internally, it uses a pandas table. + Internally, it uses a pandas table. 
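A hedged usage sketch for the split_into_trials_and_cohort method above (the Dataset variable and argument values are illustrative; the segments table is assumed to carry "speaker" and "gender" columns as the method requires):

    # dset is a hyperion Dataset prepared for speaker verification
    dset_trials, dset_cohort = dset.split_into_trials_and_cohort(
        num_1k_tar_trials=30,      # ~30k target trials
        num_trial_speakers=1000,
        intra_gender=True,
    )
    # dset_trials carries the new "trials" key and "enrollments" map and is cleaned of
    # unused recordings/classes; dset_cohort keeps the held-out speakers for S-Norm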
""" def __init__(self, df): @@ -29,7 +29,13 @@ def has_time_marks(self): def has_recording_ids(self): return "recording_id" in self.df - def recording_ids(self, ids): + def recording_ids(self, ids=None): + if ids is None: + if "recording_id" in self.df: + return self.df["recording_id"] + else: + return self.df["id"] + if "recording_id" in self.df: return self.df.loc[ids, "recording_id"] From ac71e9aed4e1a5b490ddc0f37dfa31ec4e3b2d31 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 1 Jul 2023 08:38:42 +0000 Subject: [PATCH 64/89] update joint-training of LID-FILM-ASR --- ...ucer_ecapadnn512x3_1layer_stage1_v2.1.yaml | 140 +++++++++++++++ ...ucer_ecapadnn512x3_1layer_stage2_v2.1.yaml | 91 ++++++++++ ...uned_filmed_transducer_lid_v2.1_13langs.sh | 43 +++++ .../initailize_joint_film_lid_model_bias.py | 74 ++++++++ .../v1/run_025_train_film_asr_lid.sh | 18 +- ...wav2vec2rnn_film_transducer_languageid.py} | 168 ++++++++---------- hyperion/torch/data/audio_dataset.py | 2 +- hyperion/torch/layer_blocks/film_blocks.py | 17 +- .../layer_blocks/transducer_film_joiner.py | 22 ++- .../layer_blocks/transducer_film_predictor.py | 8 +- .../hf_wav2rnn_film_transducer_languageid.py | 88 +++++++-- .../hf_wav2rnn_transducer_languageid.py | 1 + ..._wav2vec2rnn_film_transducer_languageid.py | 30 ++++ .../narchs/rnn_film_transducer_decoder.py | 37 ++-- 14 files changed, 593 insertions(+), 146 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh create mode 100644 egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py rename hyperion/bin/{finetune_wav2vec2transducer_languageid.py => finetune_wav2vec2rnn_film_transducer_languageid.py} (60%) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml new file mode 100644 index 00000000..7347e8b4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml new file mode 100644 index 00000000..377ea296 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 10.0 + loss_weight_embed: 10 + lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh new file mode 100644 index 00000000..b0e39914 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0009.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py b/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py new file mode 100644 index 00000000..3bc5148f --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py @@ -0,0 +1,74 @@ +import torch +import sys +# arguments example +# + 
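The script takes four positional arguments, read below via sys.argv; a hedged invocation sketch (checkpoint paths are hypothetical):

    # python local/initailize_joint_film_lid_model_bias.py \
    #        exp/asr_nnets/model_ep0010.pth \    # sys.argv[1]: pretrained ASR transducer checkpoint
    #        exp/lid_nnets/model_ep0012.pth \    # sys.argv[2]: pretrained LID checkpoint
    #        exp/joint_nnets/model_ep0001.pth \  # sys.argv[3]: joint FiLM model to initialize
    #        exp/joint_nnets/model_init.pth      # sys.argv[4]: output checkpoint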
+ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) +joint_model = torch.load(sys.argv[3]) + +output_model = sys.argv[4] + + +def check_update_parameters(joint_state_dict, new_joint_state_dict): + shape_changed_parameters = [] + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if param.shape != new_param.shape: + shape_changed_parameters.append(name) + elif torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + print("Shape changed parameters: {}".format(shape_changed_parameters)) + print("Unchanged parameters: {}".format(unchanged_parameters)) + print("Changed parameters: {}".format(changed_parameters)) + + + +def copy_model_parameters(ASR_model, LID_model, joint_model, output_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + + LID_state_dict = {"module." + name: param for name, param in LID_state_dict.items()} + + joint_state_dict = joint_model["model_state_dict"] + + hf_feats_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and ("transducer" in name or "film" in name)} + languageid_update_state_dict = {name: param for name, param in LID_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + + film_update_state_dict = {} + for name, param in joint_state_dict.items(): + if "linear_scale.weight" in name and "lid_film" in name: + film_update_state_dict[name] = torch.zeros_like(param) + elif "linear_scale.bias" in name and "lid_film" in name: + film_update_state_dict[name] = torch.ones_like(param) + elif ("linear_shift.weight" in name or "linear_shift.bias" in name) and "lid_film" in name: + film_update_state_dict[name] = torch.zeros_like(param) + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + new_joint_state_dict.update(film_update_state_dict) + + # import pdb;pdb.set_trace() + + new_joint_state_dict["module.transducer_fuser"] = ASR_state_dict["module.feat_fuser"] + new_joint_state_dict["module.languageid_fuser"] = LID_state_dict["module.feat_fuser"] + + + joint_model["model_state_dict"] = new_joint_state_dict + joint_model["epoch"] =1 + + check_update_parameters(joint_state_dict, new_joint_state_dict) + torch.save(joint_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model, joint_model, output_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh index 8b213cfe..f5976ee1 100755 --- a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -20,7 +20,7 @@ set -e stage=1 -ngpu=4 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -61,13 +61,13 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_film_transducer_languageid.py 
$nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -90,15 +90,15 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2transducer_languageid.py $nnet_type \ + finetune_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -123,13 +123,13 @@ if [ $stage -le 3 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s3_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2transducer.py $nnet_type \ + finetune_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/hyperion/bin/finetune_wav2vec2transducer_languageid.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py similarity index 60% rename from hyperion/bin/finetune_wav2vec2transducer_languageid.py rename to hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py index 68d8dacf..22808dbd 100755 --- a/hyperion/bin/finetune_wav2vec2transducer_languageid.py +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py @@ -9,7 +9,7 @@ import sys import time from pathlib import Path - +import gc import k2 import numpy as np import torch @@ -21,15 +21,23 @@ from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer, - HFWav2Vec2RNNTransducerResnet1D) + HFWav2Vec2RNNFiLMTransducer, + HFWav2Vec2RNNTransducerResnet1D, + HFWav2Vec2RNNFiLMTransducerResnet1D) from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer from hyperion.torch.utils import ddp from jsonargparse import (ActionConfigFile, ActionParser, 
ArgumentParser, namespace_to_dict) from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, + "hf_wav2vec2rnn_film_transducer_resnet1d": HFWav2Vec2RNNFiLMTransducerResnet1D, } @@ -99,94 +107,63 @@ def init_data(partition, rank, num_gpus, **kwargs): data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_language_collate) return data_loader - -def check_update_parameters(joint_state_dict, new_joint_state_dict, rank): - unchanged_parameters = [] - changed_parameters = [] - unloaded_parameters = [] - for name, param in joint_state_dict.items(): - new_param = new_joint_state_dict[name].to(param.device) - if torch.all(torch.eq(param, new_param)): - unchanged_parameters.append(name) - else: - changed_parameters.append(name) - # logging - if rank == 0: - logging.info("Unchanged parameters: {}".format(unchanged_parameters)) - logging.info("Changed parameters: {}".format(changed_parameters)) - - -def remove_module_from_state_dict(state_dict): - new_state_dict = {} - for name, param in state_dict.items(): - if name.startswith("module."): - new_state_dict[name[len("module."):]] = param - else: - new_state_dict[name] = param - return new_state_dict - - -def copy_model_parameters(joint_model, wav2transducer_state_dict, wav2lid_state_dict, rank): - joint_state_dict = joint_model.state_dict() - wav2transducer_state_dict = remove_module_from_state_dict(wav2transducer_state_dict) - wav2lid_state_dict = remove_module_from_state_dict(wav2lid_state_dict) - - - hf_feats_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} - transducer_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} - languageid_update_state_dict = {name: param for name, param in wav2lid_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} - - new_joint_state_dict = joint_state_dict.copy() - new_joint_state_dict.update(hf_feats_update_state_dict) - new_joint_state_dict.update(transducer_update_state_dict) - new_joint_state_dict.update(languageid_update_state_dict) - - new_joint_state_dict["transducer_fuser"] = wav2transducer_state_dict["feat_fuser"] - new_joint_state_dict["languageid_fuser"] = wav2lid_state_dict["feat_fuser"] - - - check_update_parameters(joint_state_dict, new_joint_state_dict, rank) - joint_model.load_state_dict(new_joint_state_dict) - -def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): - # load pretrained models - model_wav2transducer = torch.load(in_model_transducer) - model_wav2lid = torch.load(in_model_lid) - if rank == 0: - logging.info("init joint model") - logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) - logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) - logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) - logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) - 
logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) - logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) - - # init joint model - model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], - transducer=model_wav2transducer["model_cfg"]["transducer"], - languageid=model_wav2lid["model_cfg"]["languageid"], - feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], - feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], - feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], - loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], - loss_weight_lid=kwargs["model"]["loss_weight_lid"], - lid_length=kwargs["model"]["lid_length"], - ) - - copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) - - - # add finetune args +def init_model(num_classes, loss_class_weight, in_model_file, rank, model_class, **kwargs): model_args = model_class.filter_finetune_args(**kwargs["model"]) - # model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network ft args={}".format(model_args)) - model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] + model_args["languageid"]["num_classes"] = num_classes + # model_args["loss_class_weight"] = loss_class_weight + model = TML.load(in_model_file) + logging.info(model_args) model.change_config(**model_args) if rank == 0: logging.info("model={}".format(model)) return model +# def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): +# # load pretrained models +# model_wav2transducer = torch.load(in_model_transducer) +# model_wav2lid = torch.load(in_model_lid) +# if rank == 0: +# logging.info("init joint model") +# logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) +# logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) +# logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) +# logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) +# logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) +# logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) + +# # init joint model +# model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], +# transducer=model_wav2transducer["model_cfg"]["transducer"], +# languageid=model_wav2lid["model_cfg"]["languageid"], +# feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], +# feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], +# feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], +# loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], +# loss_weight_lid=kwargs["model"]["loss_weight_lid"], +# lid_length=kwargs["model"]["lid_length"], +# ) + +# copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) + + +# # add finetune args +# model_args = model_class.filter_finetune_args(**kwargs["model"]) + +# # model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network ft 
args={}".format(model_args)) +# model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] +# model.change_config(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) + +# model_wav2transducer = None +# model_wav2lid = None +# gc.collect() +# torch.cuda.empty_cache() +# return model def train_model(gpu_id, args): @@ -199,19 +176,22 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - # ddp_args = ddp.filter_ddp_args(**kwargs) - # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - # kwargs["rank"] = rank + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank - # for Debug - rank = 0 - kwargs["rank"] = 0 - device = torch.device("cuda:0") - world_size=1 + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(**kwargs) + # model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -280,8 +260,10 @@ def make_parser(model_class): ) - parser.add_argument("--in-model-transducer", required=True) - parser.add_argument("--in-model-lid", required=True) + # parser.add_argument("--in-model-transducer", required=True) + # parser.add_argument("--in-model-lid", required=True) + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") # model_class.add_class_args(parser, prefix="model") Trainer.add_class_args( diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 2e354031..0d1cf332 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -112,7 +112,7 @@ def _load_legacy_durations(self, time_durs_file): time_durs = SegmentSet.load(time_durs_file) self.seg_set["duration"] = time_durs.loc[ self.seg_set["id"] - ].class_id.values.astype(np.float, copy=False) + ].class_id.values.astype(float, copy=False) def _load_bpe_model(self, bpe_model, is_val): if self.rank == 0: diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 00ee1a10..0a1a1c00 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -34,7 +34,7 @@ def forward(self, x, lang_condition): class RNNWithFiLM(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh"): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh", film_cond_type="one-hot"): super(RNNWithFiLM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -43,12 +43,18 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, self.batch_first = batch_first self.rnn_type = rnn_type self.film_type = film_type + self.film_cond_type = film_cond_type + if self.rnn_type == "lstm": self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) elif self.rnn_type == "gru": self.grus = 
nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) - self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + if self.film_cond_type == "one-hot": + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + else: + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + self.lid_films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) @@ -59,8 +65,13 @@ def forward(self, x, states, lang_condition): rnns = self.lstms elif self.rnn_type == "gru": rnns = self.grus + + if self.film_cond_type == "one-hot": + films = self.films + else: + films = self.lid_films - for i, (rnn, film) in enumerate(zip(rnns, self.films)): + for i, (rnn, film) in enumerate(zip(rnns, films)): if states: x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) else: diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py index 02a9dfdf..2c6d8d48 100644 --- a/hyperion/torch/layer_blocks/transducer_film_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -21,7 +21,7 @@ class TransducerFiLMJoiner(nn.Module): vocab_size: vocabulary size """ - def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear"): + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear", film_cond_type="one-hot"): super().__init__() self.enc_feats = enc_feats @@ -32,7 +32,18 @@ def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: self.enc_proj = nn.Linear(enc_feats, hid_feats) self.pred_proj = nn.Linear(pred_feats, hid_feats) self.output = nn.Linear(hid_feats, vocab_size) - self.film = FiLM(hid_feats, condition_size, film_type) + + self.film_cond_type = film_cond_type + + + if self.film_cond_type == "one-hot": + self.film = FiLM(hid_feats, condition_size, film_type) + else: + self.film = FiLM(hid_feats, condition_size, film_type) + self.lid_film = FiLM(hid_feats, condition_size, film_type) + + # self.film = FiLM(hid_feats, condition_size, film_type) + def get_config(self): config = { @@ -69,8 +80,11 @@ def forward(self, else: x = enc_out + pred_out - x = self.film(x, lang_condition) - + if self.film_cond_type == "one-hot": + x = self.film(x, lang_condition) + else: + x = self.lid_film(x, lang_condition) + x = torch.tanh(x) logits = self.output(x) return logits diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py index dc7a7ae4..42272051 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -39,6 +39,7 @@ def __init__(self, rnn_dropout_rate: float = 0.0, rnn_type: str = "lstm", film_type: str = "linear", + film_cond_type: str = "one-hot", blank_id: int = 0): super().__init__() self.embedding = nn.Embedding( @@ -56,7 +57,8 @@ def __init__(self, condition_size=condition_size, batch_first=True, rnn_type=rnn_type, - film_type=film_type + film_type=film_type, + film_cond_type=film_cond_type ) elif rnn_type in ["lstm_residual","gru_residual"]: self.rnn = RNNWithFiLMResidual( @@ -67,7 +69,8 @@ def __init__(self, 
condition_size=condition_size, batch_first=True, rnn_type=rnn_type, - film_type=film_type + film_type=film_type, + film_cond_type=film_cond_type ) else: raise Exception(f"Unknown RNN type {rnn_type}") @@ -101,6 +104,7 @@ def get_config(self): "rnn_dropout_rate": self.rnn_dropout_rate, "rnn_type": self.rnn_type, "film_type": self.film_type, + "film_cond_type": self.film_cond_type, "blank_id": self.blank_id, } return config diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 8e29bc84..0322543d 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -46,6 +46,7 @@ def __init__(self, loss_class_weight_exp= 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, + loss_weight_embed: float = 0.005, lid_length: float = 3.0, ): @@ -95,24 +96,28 @@ def __init__(self, self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid + self.loss_weight_embed = loss_weight_embed self.lid_length = lid_length self._hf_context = contextlib.nullcontext() - self.transducer_fuser, self.films = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) - self.languageid_fuser, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) + self.transducer_fuser, self.film, self.lid_film = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser, _, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) def _make_fuser(self, method, start): feat_fuser = None - films = None + film = None + lid_film = None if method == "last": - return feat_fuser, films + return feat_fuser, None, None num_layers = self.hf_feats.num_encoder_layers + 1 - start layer_dim = self.hf_feats.hidden_size if method == "film-weighted-avg": - films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + film = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + lid_film = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif method == "film-fused-feature": feat_fuser = nn.Parameter(torch.zeros(num_layers)) film = FiLM(layer_dim, self.transducer.decoder.condition_size) + lid_film = FiLM(layer_dim, self.transducer.decoder.condition_size) elif method == "weighted-avg": feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif method == "linear": @@ -124,9 +129,9 @@ def _make_fuser(self, method, start): layer_dim, bias=False) - return feat_fuser, films + return feat_fuser, film, lid_film - def _fuse_transducer_hid_feats(self, hid_feats, lang): + def _fuse_transducer_hid_feats(self, hid_feats, lang_condition): """Fuses the hidden features from the Wav2Vec model. 
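When conditioning on the predicted LID embedding (film_cond_type other than one-hot), the model keeps two FiLM branches at each conditioning point: film, driven by the embedding of the ground-truth language id, and lid_film, driven by the embedding predicted by the LID branch; compute_embed_loss below ties their affine parameters together. A minimal sketch of the FiLM operation and of that consistency term (an illustration only, not the hyperion FiLM class, which also takes a film_type activation):

    import torch
    import torch.nn as nn

    class FiLMSketch(nn.Module):
        def __init__(self, feat_dim, cond_dim):
            super().__init__()
            self.linear_scale = nn.Linear(cond_dim, feat_dim)
            self.linear_shift = nn.Linear(cond_dim, feat_dim)

        def forward(self, x, cond):
            # x: (batch, time, feat_dim), cond: (batch, cond_dim)
            scale = self.linear_scale(cond).unsqueeze(1)
            shift = self.linear_shift(cond).unsqueeze(1)
            return scale * x + shift

    film, lid_film = FiLMSketch(512, 128), FiLMSketch(512, 128)
    lang_cond = torch.randn(4, 128)  # embedding of the true language label
    lid_embed = torch.randn(4, 128)  # embedding predicted by the LID branch
    # L1 consistency between the two sets of generated FiLM parameters,
    # mirroring compute_embed_loss below
    loss_embed = (
        (film.linear_scale(lang_cond) - lid_film.linear_scale(lid_embed)).abs().mean()
        + (film.linear_shift(lang_cond) - lid_film.linear_shift(lid_embed)).abs().mean()
    )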
Args: @@ -141,10 +146,10 @@ def _fuse_transducer_hid_feats(self, hid_feats, lang): return hid_feats[0] if self.transducer.decoder.film_cond_type in ["one-hot", "lid_pred"]: - lang_condition = self.transducer.decoder.lang_embedding(lang) + lang_condition = self.transducer.decoder.lang_embedding(lang_condition) hid_feats = hid_feats[self.feat_fusion_start_transducer:] if self.feat_fusion_method_transducer == "film-weighted-avg": - film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = tuple(self.lid_film[i](hid_feats[i], lang_condition) for i in range(len(self.lid_film))) film_hid_feats = torch.stack(film_hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) feats = torch.sum(film_hid_feats * norm_weights, dim=-1) @@ -152,7 +157,7 @@ def _fuse_transducer_hid_feats(self, hid_feats, lang): hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) feats = torch.sum(hid_feats * norm_weights, dim=-1) - feats = self.films(feats, lang_condition) + feats = self.lid_film(feats, lang_condition) elif self.feat_fusion_method_transducer == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) @@ -223,6 +228,39 @@ def forward_lid_feats(self, return feats, hid_feats, feat_lengths + def compute_embed_loss(self, lang_embed, languageid): + # compute the loss for the embedding between the film and lid_film + lang_condition = self.transducer.decoder.lang_embedding(languageid) + + # for the encoder + film_scale = self.film.linear_scale(lang_condition) + lid_film_scale = self.lid_film.linear_scale(lang_embed) + film_shift = self.film.linear_shift(lang_condition) + lid_film_shift = self.lid_film.linear_shift(lang_embed) + loss_embed_encode = torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + # for the predictor + loss_embed_predictor = 0 + for i in range(2): + film_scale = self.transducer.decoder.predictor.rnn.films[i].linear_scale(lang_condition) + lid_film_scale = self.transducer.decoder.predictor.rnn.lid_films[i].linear_scale(lang_embed) + film_shift = self.transducer.decoder.predictor.rnn.films[i].linear_shift(lang_condition) + lid_film_shift = self.transducer.decoder.predictor.rnn.lid_films[i].linear_shift(lang_embed) + loss_embed_predictor += torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + + # for the joiner + film_scale = self.transducer.decoder.joiner.film.linear_scale(lang_condition) + lid_film_scale = self.transducer.decoder.joiner.lid_film.linear_scale(lang_embed) + film_shift = self.transducer.decoder.joiner.film.linear_shift(lang_condition) + lid_film_shift = self.transducer.decoder.joiner.lid_film.linear_shift(lang_embed) + loss_embed_joiner = torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + + loss_embed = loss_embed_encode + loss_embed_predictor + loss_embed_joiner + + return loss_embed + def forward( self, x, @@ -275,15 +313,19 @@ def forward( #loss_lid = self.loss_lid(lid_logits, languageid) loss_lid = self.loss_lid(output["logits"], languageid) + # import pdb; pdb.set_trace() + # logging.info(output["h_classif"]) + + loss_embed = self.compute_embed_loss(output["h_classif"][0], languageid) # feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) - feats_transducer = 
self._fuse_transducer_hid_feats(hid_feats, output["h_classif"]) # (N, T, C) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"][0]) # (N, T, C) trans_output = self.transducer( feats_transducer, feat_lengths, text, - output["h_classif"] + output["h_classif"][0] # lid_logits ) @@ -292,9 +334,10 @@ def forward( f.transpose(1, 2) for i, f in enumerate(hid_feats) if i in return_feat_layers ] - output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed, loss_transducer=trans_output.loss, loss_lid=loss_lid, + loss_embed=loss_embed, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, h_feats=trans_output.h_feats, @@ -347,9 +390,9 @@ def infer(self, return text, lid - def unfreeze_film(self): + def unfreeze_lid_film(self): for name, param in self.named_parameters(): - if "film" in name: + if "lid_film" in name: logging.info(f"unfreezing {name}") param.requires_grad = True @@ -380,11 +423,13 @@ def set_train_mode(self, mode): self.freeze() elif mode in ["ft-film", "ft-film-grad"]: self.freeze() - self.unfreeze_film() + self.unfreeze_lid_film() elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() self.freeze_feat_fuser() + self.freeze_film() + self.unfreeze_lid_film() elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -451,6 +496,7 @@ def filter_args(**kwargs): "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", + "loss_weight_embed", "languageid", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -476,6 +522,7 @@ def get_config(self): "loss_class_weight_exp": self.loss_class_weight_exp, "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, + "loss_weight_embed": self.loss_weight_embed, "lid_length": self.lid_length, } @@ -565,6 +612,15 @@ def add_class_args(parser, prefix=None, skip=set()): """, ) + parser.add_argument( + "--loss-weight-embed", + default=0.005, + type=float, + help=""" + The weight of the embedding loss + """, + ) + parser.add_argument( "--lid-length", default=3.0, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index 8c7d54d7..d8374e77 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -22,6 +22,7 @@ class RNNTransducerLanguageIDOutput(HypDataClass): loss: torch.Tensor # Total loss loss_transducer: torch.Tensor # Loss from the transducer loss_lid: torch.Tensor # Loss from the language ID + loss_embed: Optional[torch.Tensor] = None # Loss from the embedding loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py index 
e012f17a..4215ea1d 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -47,6 +47,7 @@ def __init__( loss_class_weight_exp: float = 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, + loss_weight_embed: float = 0.005, lid_length: float = 3.0, ): @@ -81,6 +82,7 @@ def __init__( loss_class_weight_exp=loss_class_weight_exp, loss_weight_transducer=loss_weight_transducer, loss_weight_lid=loss_weight_lid, + loss_weight_embed=loss_weight_embed, lid_length=lid_length) @@ -117,8 +119,11 @@ def filter_finetune_args(**kwargs): base_args = {} valid_args = ( + "loss_lid_type", + "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", + "loss_weight_embed", "lid_length", ) child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) @@ -134,6 +139,22 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + type=str, + help=""" + The type of the loss for language id + """, + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=""" + The exponent of the class weight for language id + """, + ) parser.add_argument( "--loss-weight-transducer", @@ -153,6 +174,15 @@ def add_finetune_args(parser, prefix=None): """, ) + parser.add_argument( + "--loss-weight-embed", + default=0.005, + type=float, + help=""" + The weight of the embedding loss + """, + ) + parser.add_argument( "--lid-length", default=3.0, diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index f2cfad35..9d030ae7 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -78,8 +78,8 @@ def __init__( pruned_warmup_steps: int = 2000, langs_size: int = 13, condition_size: int = 64, - film_cond_type: str = "one-hot", film_type: str = "linear", + film_cond_type: str = "one-hot", ): super().__init__() @@ -105,13 +105,10 @@ def __init__( self._make_predictor() self._make_joiner() # make embedding layer for language id - if self.film_cond_type == "one-hot": - self.lang_embedding = nn.Embedding(langs_size, condition_size) - elif self.film_cond_type == "lid_pred": - self.lang_embedding = nn.Linear(langs_size, condition_size) - elif self.film_cond_type == "lid_pred_embed": - # self.lang_embedding = nn.Linear(langs_size, condition_size) - pass + self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.film_cond_type == "lid_pred": + self.lid_lang_embedding = nn.Linear(langs_size, condition_size) + if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) self.simple_lm_proj = nn.Linear(self.predictor.out_feats, @@ -129,7 +126,7 @@ def _make_predictor(self): if pred_type == "rnn": pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) - self.predictor = RNNPredictor(**pred_args) + self.predictor = RNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) # elif pred_type == "conv": # pred_args = filter_func_args(ConvPredictor.__init__, # self.predictor_args) @@ -145,7 +142,7 @@ def _make_joiner(self): pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, - 
self.vocab_size, self.condition_size, self.film_type) + self.vocab_size, self.condition_size, film_type=self.film_type, film_cond_type=self.film_cond_type) elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] @@ -309,11 +306,13 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, return loss, loss_simple, loss_pruned def forward( - self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang: torch.Tensor + self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang_embedding: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: # embed lang - if self.film_cond_type in ["one-hot", "lid_pred"]: - lang_embedding = self.lang_embedding(lang) + if self.film_cond_type == ["one-hot"]: + lang_embedding = self.lang_embedding(lang_embedding) + elif self.film_cond_type == ["lid_pred"]: + lang_embedding = self.lid_lang_embedding(lang_embedding) # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -346,8 +345,13 @@ def decode(self, max_sym_per_utt: int = 1000, ) -> List[int]: # embed lang - if self.film_cond_type in ["one-hot", "lid_pred"]: + # if self.film_cond_type in ["one-hot", "lid_pred"]: + # lang_embedding = self.lang_embedding(lang) + + if self.film_cond_type == ["one-hot"]: lang_embedding = self.lang_embedding(lang) + elif self.film_cond_type == ["lid_pred"]: + lang_embedding = self.lid_lang_embedding(lang) if method == "time_sync_beam_search": return self.decode_time_sync_beam_search(x, lang_embedding, @@ -730,10 +734,7 @@ def add_pred_args(parser): help= """type of recurrent network for thep predictor in [lstm, gru]""") - pred_parser.add_argument("--film-type", - default="linear", - choices=["linear", "tanh"], - help=("type of the FiLM layer")) + pred_parser.add_argument("--num-layers", From 28e61e3a8998df94e9ecc4cfcff66356f6d149c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 2 Jul 2023 07:54:52 +0000 Subject: [PATCH 65/89] update lid-film-asr training --- ...ucer_ecapadnn512x3_1layer_stage1_v2.0.yaml | 1 + ...ucer_ecapadnn512x3_1layer_stage1_v2.2.yaml | 140 ++++++++++++++++++ ...ucer_ecapadnn512x3_1layer_stage1_v2.3.yaml | 140 ++++++++++++++++++ ...ucer_ecapadnn512x3_1layer_stage2_v2.1.yaml | 9 +- ...ucer_ecapadnn512x3_1layer_stage2_v2.2.yaml | 94 ++++++++++++ ...ucer_ecapadnn512x3_1layer_stage3_v2.1.yaml | 92 ++++++++++++ ...uned_filmed_transducer_lid_v2.1_13langs.sh | 7 +- ...uned_filmed_transducer_lid_v2.2_13langs.sh | 42 ++++++ .../models/transducer/rnn_film_transducer.py | 13 ++ .../hf_wav2rnn_film_transducer_languageid.py | 90 +++++++++-- ..._wav2vec2rnn_film_transducer_languageid.py | 32 +++- .../hf_wav2vec2rnn_transducer_languageid.py | 4 +- 12 files changed, 639 insertions(+), 25 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml index 0931c052..6c06c29b 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml @@ -108,6 +108,7 @@ model: loss_weight_transducer: 1.0 loss_weight_lid: 0.0 + loss_weight_embed: 0.05 lid_length: 3.0 feat_fusion_method_transducer: film-fused-feature diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml new file mode 100644 index 00000000..7347e8b4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml new file mode 100644 index 00000000..f7a430a7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 20 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml index 377ea296..716a9d8f 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -49,18 +49,19 @@ data: model: transducer: decoder: + reduction: mean prune_range: 15 override_dropouts: false languageid: cos_scale: 32.0 - loss_lid_type: weightedCE - loss_class_weight_exp: 1.0 # 0~1 + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 loss_weight_transducer: 1.0 loss_weight_lid: 10.0 loss_weight_embed: 10 - lid_length: 3.0 + # lid_length: 3.0 # feat_fusion_method_transducer: film-fused-feature # feat_fusion_method_lid: weighted-avg @@ -87,5 +88,5 @@ trainer: epochs: 120 # eff_batch_size: 1024 eff_batch_size: 128 - train_mode: ft-film + train_mode: freeze-gt-film \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml new file mode 100644 index 00000000..2f625da0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + loss_reg_weight_transducer: 0.5 + loss_reg_weight_lid: 0.0 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-transducer + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml new file mode 100644 index 00000000..a7be4925 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-transducer + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh index b0e39914..d5d72490 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh @@ -27,7 +27,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0009.pth +nnet_s1=$nnet_s1_dir/model_ep0015.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml nnet_s2_args="" @@ -35,9 +35,8 @@ nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name nnet_s2=$nnet_s2_dir/model_ep0020.pth -nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.1.yaml +nnet_s3_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml nnet_s3_args="" nnet_s3_name=${nnet_name}.s3 -nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3_dir=exp/transducer_resnet1d_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh new file mode 100644 index 00000000..f4ccf18e --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# 
bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0006.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py index 68066442..6f82e101 100644 --- a/hyperion/torch/models/transducer/rnn_film_transducer.py +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -193,6 +193,19 @@ def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + def get_regularization_loss(self): + reg_loss = 0.0 + total_params = 0 + + for param in self.parameters(): + reg_loss += torch.norm(param)**2 + total_params += torch.numel(param) + + reg_loss = (reg_loss) / total_params + + return reg_loss + def change_config( self, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 0322543d..7daeddcb 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -47,6 +47,8 @@ def __init__(self, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, loss_weight_embed: float = 0.005, + loss_reg_weight_transducer: float = 0.0, + loss_reg_weight_lid: float = 0.0, lid_length: float = 3.0, ): @@ -97,6 +99,8 @@ def __init__(self, self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self.loss_weight_embed = loss_weight_embed + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid self.lid_length = lid_length self._hf_context = contextlib.nullcontext() self.transducer_fuser, self.film, self.lid_film = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) @@ -334,7 +338,18 @@ def forward( f.transpose(1, 2) for i, f in enumerate(hid_feats) if i in return_feat_layers ] - output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed, + + loss_reg_lid = 0 + if self.loss_reg_weight_lid > 0: + loss_reg_lid = self.languageid.get_regularization_loss() + + loss_reg_transducer = 0 + if self.loss_reg_weight_transducer > 0: + loss_reg_transducer = self.transducer.get_regularization_loss() + + + + output = 
RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed + self.loss_reg_weight_lid * loss_reg_lid + self.loss_reg_weight_transducer * loss_reg_transducer, loss_transducer=trans_output.loss, loss_lid=loss_lid, loss_embed=loss_embed, @@ -396,16 +411,29 @@ def unfreeze_lid_film(self): logging.info(f"unfreezing {name}") param.requires_grad = True - # def freeze_feat_fuser(self): - # if self.feat_fuser is None: - # return + def freeze_lid(self): + self.languageid.freeze() - # if self.feat_fusion_method_transducer == "weighted-avg": - # self.feat_fuser.requires_grad = False - # return + def freeze_film(self): + for name, param in self.named_parameters(): + # logging.info(f"parameter {name}") + if "film" in name and "lid_film" not in name: + logging.info(f"freezing {name}") + param.requires_grad = False + if "lang_embedding" in name: + logging.info(f"freezing {name}") + param.requires_grad = False + + def freeze_lid_feat_fuser(self): + if self.languageid_fuser is None: + return - # for param in self.feat_fuser.parameters(): - # param.requires_grad = False + if self.feat_fusion_method_lid == "weighted-avg": + self.languageid_fuser.requires_grad = False + return + + for param in self.languageid_fuser.parameters(): + param.requires_grad = False def freeze_hf_feats(self): self.hf_feats.freeze() @@ -414,11 +442,16 @@ def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() def set_train_mode(self, mode): + logging.info("setting train mode to %s", mode) + logging.info("train mode was %s", self._train_mode) if mode == self._train_mode: return if mode == "full": self.unfreeze() + if mode == "freeze-gt-film": + self.unfreeze() + self.freeze_film() elif mode == "frozen": self.freeze() elif mode in ["ft-film", "ft-film-grad"]: @@ -427,9 +460,10 @@ def set_train_mode(self, mode): elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() - self.freeze_feat_fuser() self.freeze_film() - self.unfreeze_lid_film() + self.freeze_lid_feat_fuser() + self.freeze_lid() + # self.unfreeze_lid_film() elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -455,6 +489,7 @@ def _train(self, train_mode: str): super()._train(train_mode) elif train_mode in [ "ft-film", + "freeze-gt-film", "ft-transducer", "hf-feats-frozen", "ft-film-grad", @@ -472,6 +507,7 @@ def valid_train_modes(): return [ "full", "frozen", + "freeze-gt-film", "ft-film", "ft-embed-affine", "ft-transducer", @@ -497,6 +533,8 @@ def filter_args(**kwargs): "loss_weight_transducer", "loss_weight_lid", "loss_weight_embed", + "loss_reg_weight_transducer", + "loss_reg_weight_lid", "languageid", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -523,14 +561,24 @@ def get_config(self): "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, "loss_weight_embed": self.loss_weight_embed, + "loss_reg_weight_transducer": self.loss_reg_weight_transducer, + "loss_reg_weight_lid": self.loss_reg_weight_lid, "lid_length": self.lid_length, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def change_config(self, hf_feats, transducer, languageid): - logging.info("changing hf wav2transducer config") + def change_config(self, loss_weight_transducer, loss_weight_lid, loss_weight_embed, loss_reg_weight_transducer, loss_reg_weight_lid, lid_length, hf_feats, transducer, 
languageid): + logging.info("changing hf wav2film_transducer_languageid config") + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.loss_weight_embed = loss_weight_embed + self.lid_length = lid_length + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + self.hf_feats.change_config(**hf_feats) self.transducer.change_config(**transducer) self.languageid.change_config(**languageid) @@ -620,6 +668,22 @@ def add_class_args(parser, prefix=None, skip=set()): The weight of the embedding loss """, ) + parser.add_argument( + "--loss-reg-weight-transducer", + default=0.0, + type=float, + help=""" + The weight of the transducer regularization loss + """, + ) + parser.add_argument( + "--loss-reg-weight-lid", + default=0.0, + type=float, + help=""" + The weight of the lid regularization loss + """, + ) parser.add_argument( "--lid-length", diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py index 4215ea1d..cad64e99 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -48,6 +48,8 @@ def __init__( loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, loss_weight_embed: float = 0.005, + loss_reg_weight_transducer: float = 0.0, + loss_reg_weight_lid: float = 0.0, lid_length: float = 3.0, ): @@ -82,6 +84,8 @@ def __init__( loss_class_weight_exp=loss_class_weight_exp, loss_weight_transducer=loss_weight_transducer, loss_weight_lid=loss_weight_lid, + loss_reg_weight_transducer=loss_reg_weight_transducer, + loss_reg_weight_lid=loss_reg_weight_lid, loss_weight_embed=loss_weight_embed, lid_length=lid_length) @@ -116,16 +120,20 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): - base_args = {} valid_args = ( - "loss_lid_type", - "loss_class_weight_exp", + # "loss_lid_type", + # "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", "loss_weight_embed", + "loss_reg_weight_transducer", + "loss_reg_weight_lid", "lid_length", ) + + base_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) @@ -183,6 +191,24 @@ def add_finetune_args(parser, prefix=None): """, ) + parser.add_argument( + "--loss-reg-weight-transducer", + default=0.0, + type=float, + help=""" + The weight of the transducer regularization loss + """, + ) + + parser.add_argument( + "--loss-reg-weight-lid", + default=0.0, + type=float, + help=""" + The weight of the lid regularization loss + """, + ) + parser.add_argument( "--lid-length", default=3.0, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index c8cd974b..4a8ca173 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -99,13 +99,15 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): - base_args = {} valid_args 
= ( "loss_weight_transducer", "loss_weight_lid", "lid_length", ) + + base_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) From b387dddad98594340d29b5a6db24e0a25198b617 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sun, 2 Jul 2023 14:02:20 -0400 Subject: [PATCH 66/89] update config --- .../v1/global_conf/config_lid_v2.2_13langs.sh | 6 +-- .../global_conf/config_lid_v4.0_13langs_v3.sh | 2 +- .../v1/global_conf/config_lid_v6.0_13langs.sh | 4 +- .../v1/global_conf/config_lid_v6.2_13langs.sh | 4 +- .../v1/global_conf/config_lid_v6.3_13langs.sh | 4 +- .../v1/global_conf/config_lid_v6.4_13langs.sh | 4 +- ...uned_filmed_transducer_lid_v1.0_13langs.sh | 6 +-- ...g_pruned_filmed_transducer_v2.0_13langs.sh | 2 +- ...pruned_filmed_transducer_v4.2.1_13langs.sh | 8 ++-- ...g_pruned_filmed_transducer_v5.1_13langs.sh | 4 +- ...g_pruned_filmed_transducer_v6.0_13langs.sh | 11 +++-- ...nfig_pruned_transducer_lid_v1.0_13langs.sh | 2 +- .../config_pruned_transducer_v4.0_13langs.sh | 6 +-- .../global_conf/config_transducer_v3.3_it.sh | 2 +- egs/commonvoice/v1/run_004_compute_bpe.sh | 42 +++++++++---------- egs/commonvoice/v1/run_030_inference.sh | 2 + egs/commonvoice/v1/run_031_inference_film.sh | 2 + egs/commonvoice/v1/run_032_identificate.sh | 2 + .../identificate_wav2vec2resnet1d.sh | 5 ++- .../decode_wav2vec2rnn_film_transducer.sh | 6 ++- .../decode_wav2vec2rnn_transducer.sh | 9 ++-- .../decode_wav2vec2rnn_transducer_lid.sh | 8 ++++ .../preprocess_audios_for_nnet_train.sh | 5 +++ hyperion/bin/identificate_wav2languageid.py | 1 + hyperion/torch/layers/global_pool.py | 9 ++++ .../hf_wav2rnn_transducer_languageid.py | 23 ++++++++-- 26 files changed, 115 insertions(+), 64 deletions(-) diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh index debd9377..ec13ae3d 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" - +test_data="br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -34,7 +34,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v2.2_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0003.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh index 8d6cbc80..9a154499 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh +++ 
b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0003.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s4 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh index ebbd7fd1..28404ba5 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -32,7 +32,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.0.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v6.0_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0003.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh index 57fb5d0b..f9d932e4 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -26,7 +26,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_resnet1d_v6.2_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0034.pth +nnet_s1=$nnet_s1_dir/model_ep0024.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.2.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh index d1847910..cedfb6e3 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training 
nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -26,7 +26,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_resnet1d_v6.3_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0034.pth +nnet_s1=$nnet_s1_dir/model_ep0033.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.3.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh index 88190921..5124da23 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -26,7 +26,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_resnet1d_v6.4_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0034.pth +nnet_s1=$nnet_s1_dir/model_ep0035.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.4.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh index 8d9e95d3..69dcb809 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" - +test_data=" ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -27,7 +27,7 @@ nnet_s1_args="" 
nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0016.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh index 0f3845d7..b3a07306 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -40,7 +40,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0003.pth +nnet_s2=$nnet_s2_dir/model_ep0047.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh index d209d421..f7480a61 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" - +test_data="kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio" #ca_test_proc_audio +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -29,13 +29,13 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2.1_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0012.pth +nnet_s1=$nnet_s1_dir/model_ep0025.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s4 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0003.pth +nnet_s2=$nnet_s2_dir/model_ep0001.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.1.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh index ab3d1ec8..09a139ab 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio 
fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -29,7 +29,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.1_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0042.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.1.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh index 71d38168..28f381ea 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" - +test_data="en_test_proc_audio ca_test_proc_audio" +#ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -29,17 +29,16 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0003.pth +nnet_s1=$nnet_s1_dir/model_ep0005.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s4 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0003.pth +nnet_s2=$nnet_s2_dir/model_ep0005.pth nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v6.0.yaml nnet_s3_args="" nnet_s3_name=${nnet_name}.s5 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name -nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth +nnet_s3=$nnet_s3_dir/model_ep0011.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh index aaafecc1..ffa2a057 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -38,7 +38,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_rnnt_k2_pruned_transducer_ecapadnn1024x3.v1.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0003.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh 
b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh index 29a762fa..a809e51d 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -28,13 +28,13 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0015.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0015.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh index c0fbe9dc..b3648580 100644 --- a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -25,7 +25,7 @@ nnet_name=${hf_model_name}_transducer_v3.3_it nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0015.pth +nnet_s1=$nnet_s1_dir/model_ep0114.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh index 617f03ae..ee14ca2b 100755 --- a/egs/commonvoice/v1/run_004_compute_bpe.sh +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -6,10 +6,8 @@ set -e vocab_sizes=( - # 5000 - 2000 - 1000 - 500 + 8000 + 16000 ) dl_dir=$PWD/download @@ -23,14 +21,14 @@ config_file=default_config.sh . 
$config_file -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - echo "Stage 1: Dump transcripts for LM training" - mkdir -p data/lm - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - > data/lm/${language}_transcript_words.txt -fi +# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +# echo "Stage 1: Dump transcripts for LM training" +# mkdir -p data/lm +# gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ +# | jq '.text' \ +# | sed 's:"::g' \ +# > data/lm/${language}_transcript_words.txt +# fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "Stage 2: Prepare BPE based lang" @@ -44,16 +42,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "!SIL 1" >> $lang_dir/words.txt echo " 2" >> $lang_dir/words.txt - # Add regular words to words.txt - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - | sed 's: :\n:g' \ - | sort \ - | uniq \ - | sed '/^$/d' \ - | awk '{print $0,NR+2}' \ - >> $lang_dir/words.txt + # # Add regular words to words.txt + # gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + # | jq '.text' \ + # | sed 's:"::g' \ + # | sed 's: :\n:g' \ + # | sort \ + # | uniq \ + # | sed '/^$/d' \ + # | awk '{print $0,NR+2}' \ + # >> $lang_dir/words.txt # Add remaining special word symbols expected by LM scripts. num_words=$(cat $lang_dir/words.txt | wc -l) diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index ec5b140b..72da282e 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -34,6 +34,8 @@ fi transducer_dir=exp/transducer/$nnet_name +rm -f $transducer_dir/overall_wer_char.txt + # test_data=test_clean diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh index 7b796107..d5749eb4 100755 --- a/egs/commonvoice/v1/run_031_inference_film.sh +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -34,6 +34,8 @@ fi transducer_dir=exp/transducer/$nnet_name +rm -f $transducer_dir/overall_wer_char.txt + # test_data=test_clean diff --git a/egs/commonvoice/v1/run_032_identificate.sh b/egs/commonvoice/v1/run_032_identificate.sh index a9a8cee5..76b98c34 100755 --- a/egs/commonvoice/v1/run_032_identificate.sh +++ b/egs/commonvoice/v1/run_032_identificate.sh @@ -34,6 +34,8 @@ fi lid_dir=exp/resnet1d/$nnet_name +rm -f $lid_dir/overall_lid_score.txt + # Extracts x-vectors for evaluation for name in $test_data # $dev_data $test_data do diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh index 5a2bbc27..5a9a30c8 100755 --- a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -77,8 +77,11 @@ if [ $stage -le 1 ];then echo "compute error rate" cat $output_dir/languageid.* > $output_dir/langs + python steps_lid/cal_lid_score.py $output_dir/langs > $output_dir/lid_score - python steps_lid/lid_score.py $output_dir/langs >> $output_dir/scores + echo $(basename "$output_dir") >> $output_dir/../overall_lid_score.txt + cat $output_dir/lid_score >> $output_dir/../overall_lid_score.txt + echo " " >> $output_dir/../overall_lid_score.txt # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text diff --git 
a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh index ebd6398d..17378c29 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh @@ -76,7 +76,11 @@ if [ $stage -le 1 ];then python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer - # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt + fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh index 986c8190..18d6ad4c 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh @@ -69,11 +69,14 @@ if [ $stage -le 1 ];then python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text python steps_transducer/word2char.py $data_dir/text $data_dir/text_char - python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model - python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + # python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + # python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer - # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh index 3bf84cbd..0363eaf1 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh @@ -69,6 +69,7 @@ if [ $stage -le 1 ];then echo "compute wer" cat $output_dir/transducer.*.text > $output_dir/transducer.text cat $output_dir/languageid.* > $output_dir/langs + python steps_lid/cal_lid_score.py $output_dir/langs > $output_dir/lid_score python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text python steps_transducer/word2char.py $data_dir/text $data_dir/text_char @@ -79,5 +80,12 @@ if [ $stage -le 1 ];then compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer compute-wer --text 
--mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_lid_score.txt + cat $output_dir/lid_score >> $output_dir/../overall_lid_score.txt + echo " " >> $output_dir/../overall_lid_score.txt + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt + fi diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..c6c3ea9f 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -11,6 +11,7 @@ nodes=b1 storage_name=$(date +'%m_%d_%H_%M') proc_opts="--remove-dc-offset" use_bin_vad=false +osr=0 echo "$0 $@" # Print the command line for logging @@ -90,6 +91,10 @@ else fi fi +if [ "$osr" != 0 ];then + args="${args} --output-sampling-rate ${osr}" +fi + $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ diff --git a/hyperion/bin/identificate_wav2languageid.py b/hyperion/bin/identificate_wav2languageid.py index 8b01ac25..37cf22e4 100755 --- a/hyperion/bin/identificate_wav2languageid.py +++ b/hyperion/bin/identificate_wav2languageid.py @@ -120,6 +120,7 @@ def decode_languageid(input_spec, output_spec, scp_sep, model_path, lang_file, device = init_device(use_gpu) model = load_model(model_path, device) + logging.info(nn.functional.softmax(model.feat_fuser, dim=-1)) # load language dict form langfile by row number lang_dict = {} with open(lang_file, "r") as f: diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 8fe67792..d97b8c9e 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -800,6 +800,15 @@ def forward(self, x, x_lengths=None, weights=None): else: min_value = -1e20 mask = weights.eq(0) + # #logging mask type, shape + # logging.info('mask type={}, shape={}'.format(mask.dtype, mask.shape)) + # #logging attn type, min_value type + # logging.info('attn type={}'.format(attn.dtype)) + # logging.info('attn={}'.format(attn)) + # logging.info('min_value={}'.format(min_value)) + + + attn = attn.masked_fill(mask, min_value) attn = nnf.softmax(attn, dim=-1) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index d8374e77..60920a36 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -296,17 +296,32 @@ def infer(self, List of list of integer indexes of the recognizer's symbols. 
""" - feats_transducer, _, _, feat_lengths = self.forward_feats(x, x_lengths) + feats_transducer, feats_languageid, _, feat_lengths = self.forward_feats(x, x_lengths) + # logging.info(f"feat_lengths: {feat_lengths}") + # logging.info(f"feats_transducer.shape: {feats_transducer.shape}") + # logging.info(f"feats_languageid.shape: {feats_languageid.shape}") + # logging.info(f"feats_transducer: {feats_transducer}") + # logging.info(f"feats_languageid: {feats_languageid}") + lid = self.languageid( + feats_languageid.float(), + feat_lengths, + None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ) - feats = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) - y = self.transducer.infer(feats, + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + text = self.transducer.infer(feats_transducer, feat_lengths, decoding_method=decoding_method, beam_width=beam_width, max_sym_per_frame=max_sym_per_frame, max_sym_per_utt=max_sym_per_utt) - return y + + return text, lid # def freeze_feat_fuser(self): # if self.feat_fuser is None: From acbfc06941ce066c99cd1a9c3de3674a0a200f39 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Tue, 4 Jul 2023 17:34:05 -0400 Subject: [PATCH 67/89] update joint training for ASR-LID --- egs/commonvoice/v1/conf/clsp.conf | 13 ++- egs/commonvoice/v1/conf/slurm.conf | 4 +- ...2base_rnnt_film_k2_pruned_stage2_v1.0.yaml | 17 ++-- ...2base_rnnt_film_k2_pruned_stage3_v6.0.yaml | 4 +- ...v2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml | 4 +- ...v2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml | 4 +- ...v2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml | 3 +- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml | 2 +- ...c2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml | 8 +- ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 4 +- ...v2vec2xlsr300m_transducer_stage2_v3.2.yaml | 5 +- egs/commonvoice/v1/datapath.sh | 2 +- .../v1/global_conf/config_lid_v2.1_13langs.sh | 2 +- .../v1/global_conf/config_lid_v2.2_13langs.sh | 2 +- ...g_pruned_filmed_transducer_v1.0_13langs.sh | 2 +- ...g_pruned_filmed_transducer_v6.0_13langs.sh | 2 +- .../config_pruned_transducer_v1.3_13langs.sh | 2 +- .../config_pruned_transducer_v4.0_13langs.sh | 2 +- .../global_conf/config_transducer_v3.3_it.sh | 3 +- egs/commonvoice/v1/run_004_compute_bpe.sh | 42 ++++----- egs/commonvoice/v1/run_011_train_asr.sh | 4 +- egs/commonvoice/v1/run_015_train_film_asr.sh | 13 ++- egs/commonvoice/v1/run_020_train_asr_lid.sh | 2 +- .../preprocess_audios_for_nnet_train.sh | 2 +- ..._wav2vec2rnn_film_transducer_languageid.py | 2 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 4 +- ...train_wav2vec2rnn_transducer_languageid.py | 4 +- hyperion/bin/train_wav2vec2transducer.py | 7 ++ hyperion/torch/layers/global_pool.py | 5 + .../hf_wav2rnn_transducer_languageid.py | 91 ++++++++++++++++--- .../hf_wav2vec2rnn_transducer_languageid.py | 35 ++++++- 31 files changed, 203 insertions(+), 93 deletions(-) diff --git a/egs/commonvoice/v1/conf/clsp.conf b/egs/commonvoice/v1/conf/clsp.conf index 959c62a7..1c75f327 100644 --- a/egs/commonvoice/v1/conf/clsp.conf +++ b/egs/commonvoice/v1/conf/clsp.conf @@ -1,11 +1,16 @@ # Default configuration -command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V -option mem=* -l mem_free=$0,ram_free=$0 +command sbatch --export=PATH +#command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* --mem-per-cpu $0 +# option mem=* -l mem_free=$0,ram_free=$0 option mem=0 # Do not add anything to qsub_opts option num_threads=* -pe smp $0 option num_threads=1 # Do not add 
anything to qsub_opts option max_jobs_run=* -tc $0 default gpu=0 -option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' -option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' +option gpu=0 +option gpu=* -p GPU-shared --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +#option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +#option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' + diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf index 262344ea..423d9133 100644 --- a/egs/commonvoice/v1/conf/slurm.conf +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -1,7 +1,7 @@ # Default configuration command sbatch --export=PATH option name=* --job-name $0 -default time=48:00:00 +default time=24:00:00 option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 @@ -10,6 +10,6 @@ option num_threads=1 --cpus-per-task 1 option num_nodes=* --nodes $0 default gpu=0 option gpu=0 -option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 1 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU # note: the --max-jobs-run option is supported as a special case # by slurm.pl and you don't have to handle it in the config file. diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml index a867f12a..5a1555dd 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -10,8 +10,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 15. - max_audio_length: 15. + max_batch_length: 40. + max_audio_length: 20. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.3 data_loader: num_workers: 1 @@ -34,8 +34,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 15. - max_audio_length: 15. + max_batch_length: 40. + max_audio_length: 20. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -43,13 +43,14 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.5 + num_chunks_per_seg_epoch: 1.0 data_loader: num_workers: 1 model: transducer: decoder: prune_range: 15 + override_dropouts: false trainer: optim: opt_type: sgd @@ -59,8 +60,8 @@ trainer: lrsched: lrsch_type: exp_lr decay_rate: 0.5 - decay_steps: 45000 - hold_steps: 30000 + decay_steps: 180000 + hold_steps: 60000 min_lr: 4e-5 warmup_steps: 6000 update_lr_on_opt_step: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml index 4a72296d..d2f01bd9 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml @@ -22,7 +22,7 @@ data: num_chunks_per_seg_epoch: 0.1 data_loader: - num_workers: 8 + num_workers: 4 val: dataset: aug_cfgs: @@ -45,7 +45,7 @@ data: weight_exponent: 0.3 num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 8 + num_workers: 4 model: hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml index 3712babc..39c61fa7 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -9,7 +9,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 70. + max_batch_length: 50. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -30,7 +30,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 70. + max_batch_length: 50. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml index f41f8dad..7e059b3b 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml @@ -70,8 +70,8 @@ trainer: lrsched: lrsch_type: exp_lr decay_rate: 0.5 - decay_steps: 180000 - hold_steps: 60000 + decay_steps: 45000 + hold_steps: 30000 min_lr: 4e-5 warmup_steps: 6000 update_lr_on_opt_step: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml index 9db63d77..e5ae33a4 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -49,10 +49,11 @@ model: decoder: prune_range: 15 override_dropouts: false + reduction: mean trainer: optim: opt_type: sgd - lr: 0.005 + lr: 0.0002 momentum: 0.9 weight_decay: 4e-4 lrsched: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml index 77cd2d26..8c62ac1b 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml @@ -47,7 +47,7 @@ model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml trainer: optim: opt_type: sgd - lr: 0.003 + lr: 0.001 momentum: 0.9 weight_decay: 4e-4 lrsched: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml index c73c7130..a40db186 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml @@ -49,16 +49,16 @@ model: trainer: optim: opt_type: sgd - lr: 0.0005 + lr: 0.0001 momentum: 0.9 weight_decay: 4e-4 lrsched: lrsch_type: exp_lr decay_rate: 0.5 - decay_steps: 420000 - hold_steps: 300000 + decay_steps: 60000 + hold_steps: 30000 min_lr: 4e-5 - warmup_steps: 15000 + warmup_steps: 5000 update_lr_on_opt_step: true grad_clip: 100 use_amp: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml index 96e0c4aa..e9fe0b05 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 1 + num_workers: 2 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 1 + num_workers: 2 model: wav2vec2xlsr300m_transducer_do0.4.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml index 69c489b0..2e5a9ea5 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -32,10 +32,7 @@ data: model: transducer: decoder: - 
override_dropouts: true - embedding_dropout_rate: 0.3 - rnn_dropout_rate: 0.3 - + override_dropouts: false trainer: optim: opt_type: sgd diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh index e844d6cd..56b242ed 100644 --- a/egs/commonvoice/v1/datapath.sh +++ b/egs/commonvoice/v1/datapath.sh @@ -5,7 +5,7 @@ if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - commonvoice_root= + commonvoice_root=/scratch4/jvillal7/ylu125/corpora/commonvoice musan_root=/export/corpora5/JHU/musan echo "Put your database paths here" exit 1 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh index c5febd98..9d35d162 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_resnet1d_v2.1_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0022.pth +nnet_s1=$nnet_s1_dir/model_ep0002.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.1.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh index debd9377..1db9b7a6 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_resnet1d_v2.2_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0014.pth +nnet_s1=$nnet_s1_dir/model_ep0013.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh index 1fc49fdd..b0ed4451 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -34,7 +34,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0016.pth +nnet_s1=$nnet_s1_dir/model_ep0007.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh index 71d38168..ea68b945 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -29,7 +29,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0003.pth +nnet_s1=$nnet_s1_dir/model_ep0005.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh index 575a8436..fb6709db 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh @@ -28,7 
+28,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3_13_langs_16000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0019.pth +nnet_s1=$nnet_s1_dir/model_ep0002.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh index 29a762fa..f43b323f 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0016.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh index c0fbe9dc..d62fcef4 100644 --- a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -25,7 +25,7 @@ nnet_name=${hf_model_name}_transducer_v3.3_it nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0015.pth +nnet_s1=$nnet_s1_dir/model_ep0042.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml nnet_s2_args="" @@ -39,3 +39,4 @@ nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth nnet_s3=$nnet_s3_dir/model_ep0005.pth + diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh index 617f03ae..ee14ca2b 100755 --- a/egs/commonvoice/v1/run_004_compute_bpe.sh +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -6,10 +6,8 @@ set -e vocab_sizes=( - # 5000 - 2000 - 1000 - 500 + 8000 + 16000 ) dl_dir=$PWD/download @@ -23,14 +21,14 @@ config_file=default_config.sh . 
$config_file -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - echo "Stage 1: Dump transcripts for LM training" - mkdir -p data/lm - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - > data/lm/${language}_transcript_words.txt -fi +# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +# echo "Stage 1: Dump transcripts for LM training" +# mkdir -p data/lm +# gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ +# | jq '.text' \ +# | sed 's:"::g' \ +# > data/lm/${language}_transcript_words.txt +# fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "Stage 2: Prepare BPE based lang" @@ -44,16 +42,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "!SIL 1" >> $lang_dir/words.txt echo " 2" >> $lang_dir/words.txt - # Add regular words to words.txt - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - | sed 's: :\n:g' \ - | sort \ - | uniq \ - | sed '/^$/d' \ - | awk '{print $0,NR+2}' \ - >> $lang_dir/words.txt + # # Add regular words to words.txt + # gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + # | jq '.text' \ + # | sed 's:"::g' \ + # | sed 's: :\n:g' \ + # | sort \ + # | uniq \ + # | sed '/^$/d' \ + # | awk '{print $0,NR+2}' \ + # >> $lang_dir/words.txt # Add remaining special word symbols expected by LM scripts. num_words=$(cat $lang_dir/words.txt | wc -l) diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index 284a68f5..55cb04a3 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -18,7 +18,7 @@ set -e #export CONV_RSH=ssh #export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH -export CUDA_VISIBLE_DEVICES=0,1 +# export CUDA_VISIBLE_DEVICES=0,1 stage=1 ngpu=2 config_file=default_config.sh @@ -106,8 +106,8 @@ if [ $stage -le 2 ]; then --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ - --master-port 1236 \ --num-gpus $ngpu + # --master-port 1236 \ fi diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh index fbf30558..638384bb 100755 --- a/egs/commonvoice/v1/run_015_train_film_asr.sh +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -74,7 +74,6 @@ if [ $stage -le 1 ]; then --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ - --in-model-file $nnet_rnn_transducer \ --master-port 1237 \ --num-gpus $ngpu @@ -117,26 +116,30 @@ if [ $stage -le 3 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" fi - mkdir -p $nnet_s3_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s3_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - - .py $nnet_type \ + finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + 
--data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ - --in-model-file $nnet_s2 \ + --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ --num-gpus $ngpu + fi diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh index 4b312e76..6a4b3252 100755 --- a/egs/commonvoice/v1/run_020_train_asr_lid.sh +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -20,7 +20,7 @@ set -e stage=1 -ngpu=4 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..0678b63c 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -89,7 +89,7 @@ else cp $data_in/$f $data_out/$f fi fi - +echo $cmd $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ diff --git a/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py index 22808dbd..514fe4d1 100755 --- a/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py @@ -196,7 +196,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} + metrics = {} #{"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 7018c406..6965f9f9 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -123,7 +123,7 @@ def train_model(gpu_id, args): set_float_cpu("float32") #torch.backends.cudnn.deterministic = True #torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False + # torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -252,5 +252,5 @@ def make_parser(model_class): args_sc.model_class = model_dict[model_type] # torch docs recommend using forkserver - # multiprocessing.set_start_method("forkserver") + multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py index 85689ac3..bafe8f66 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py @@ -105,7 +105,7 @@ def init_data(partition, rank, num_gpus, **kwargs): collate_fn=transducer_language_collate) return data_loader -def init_model(blank_id, vocab_size, num_classes, rank, model_class, **kwargs): +def init_model(blank_id, vocab_size, num_classes, loss_class_weight, rank, model_class, **kwargs): model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) @@ -113,6 +113,7 @@ def init_model(blank_id, vocab_size, num_classes, rank, model_class, **kwargs): 
model_args["transducer"]["decoder"]["blank_id"] = blank_id model_args["transducer"]["decoder"]["vocab_size"] = vocab_size model_args["languageid"]["num_classes"] = num_classes + model_args["loss_class_weight"] = loss_class_weight model = model_class(**model_args) if rank == 0: logging.info("model={}".format(model)) @@ -149,6 +150,7 @@ def train_model(gpu_id, args): model = init_model(train_loader.dataset.sp.piece_to_id(""), train_loader.dataset.sp.get_piece_size(), list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 2368f1c2..c0264299 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -27,6 +27,7 @@ model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, + "hf_wav2vec2rnn_transducer": HFWav2Vec2Transducer, } @@ -51,9 +52,12 @@ def transducer_collate(batch): def init_data(partition, rank, num_gpus, **kwargs): + logging.getLogger().setLevel(logging.INFO) data_kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**data_kwargs["dataset"]) sampler_args = data_kwargs["sampler"] + logging.info("rank={}".format(rank)) + logging.info("{} audio dataset args={}".format(partition, ad_args)) if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) @@ -85,6 +89,7 @@ def init_data(partition, rank, num_gpus, **kwargs): def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + logging.getLogger().setLevel(logging.INFO) model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) @@ -123,6 +128,8 @@ def train_model(gpu_id, args): train_loader.dataset.sp.get_piece_size(), **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) + logging.info("trainer args={}".format(trn_args)) + logging.info("rank={}".format(rank)) if rank == 0: logging.info("trainer args={}".format(trn_args)) metrics = {} #{"acc": CategoricalAccuracy()} diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 8fe67792..4967a2c5 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -781,9 +781,13 @@ def forward(self, x, x_lengths=None, weights=None): x = x.transpose(1, self.dim) # x = (batch, feat_dim, time) + # logging.info("x_lengths",x_lengths) + # logging.info("weights_bef",weights) weights = self._standardize_weights(x, x_lengths, weights) # (batch, 1, time) x_inner = self.conv1(x) # (batch, inner_dim, time) + # logging.info("weights_aft",weights) # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) + # logging.info('weights shape={} {}'.format(weights.shape, weights.dtype)) if self.use_global_context: global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) @@ -800,6 +804,7 @@ def forward(self, x, x_lengths=None, weights=None): else: min_value = -1e20 mask = weights.eq(0) + # logging.info("attn", attn.shape, mask.shape) attn = attn.masked_fill(mask, min_value) attn = nnf.softmax(attn, dim=-1) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index d8374e77..fe6cee1d 100644 
--- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -46,9 +46,13 @@ def __init__(self, hf_feats: TorchModel, transducer: Union[Dict, TorchModel], languageid: Union[Dict, TorchModel], - feat_fusion_start: int = 0, + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, feat_fusion_method_transducer: str = "weighted-avg", feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp= 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -82,22 +86,34 @@ def __init__(self, self.transducer = transducer self.languageid = languageid - - self.feat_fusion_start = feat_fusion_start + self.feat_fusion_start_transducer = feat_fusion_start_transducer + self.feat_fusion_start_lid = feat_fusion_start_lid self.feat_fusion_method_transducer = feat_fusion_method_transducer self.feat_fusion_method_lid = feat_fusion_method_lid + self.loss_lid_type = loss_lid_type + self.loss_class_weight = loss_class_weight + self.loss_class_weight_exp = loss_class_weight_exp + + if loss_lid_type == "CE" or loss_lid_type is None: + self.loss_lid = nn.CrossEntropyLoss() + elif loss_lid_type == "weightedCE": + self.loss_lid = nn.CrossEntropyLoss(weight=torch.tensor(loss_class_weight.values, dtype=torch.float)**(-loss_class_weight_exp)) + logging.info(torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp)) + elif loss_lid_type == "focal_loss": + self.loss_lid = FocalLoss(alpha=torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp), gamma=2, size_average=True) + self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self.lid_length = lid_length self._hf_context = contextlib.nullcontext() - self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer) - self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid) + self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) - def _make_fuser(self, method): + def _make_fuser(self, method, start): if method == "last": feat_fuser = None return feat_fuser - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + num_layers = self.hf_feats.num_encoder_layers + 1 - start layer_dim = self.hf_feats.hidden_size if method == "weighted-avg": feat_fuser = nn.Parameter(torch.zeros(num_layers)) @@ -126,7 +142,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start_transducer:] if self.feat_fusion_method_transducer == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_transducer_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) @@ -257,7 +273,8 @@ def forward( return_logits=return_logits, ) - loss_lid = nn.CrossEntropyLoss()(logits, languageid) + # loss_lid = nn.CrossEntropyLoss()(logits, languageid) + loss_lid = self.loss_lid(logits, languageid) trans_output = self.transducer( feats_transducer, @@ -390,9 +407,13 @@ def filter_args(**kwargs): valid_args = ( "hf_feats", "transducer", - "feat_fusion_start", + "feat_fusion_start_transducer", + "feat_fusion_start_lid", 
"feat_fusion_method_transducer", "feat_fusion_method_lid", + "loss_lid_type", + "loss_class_weight", + "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", "languageid", @@ -411,9 +432,13 @@ def get_config(self): "hf_feats": hf_cfg, "transducer": tran_cfg, "languageid": lid_cfg, - "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_start_transducer": self.feat_fusion_start_transducer, + "feat_fusion_start_lid": self.feat_fusion_start_lid, "feat_fusion_method_transducer": self.feat_fusion_method_transducer, "feat_fusion_method_lid": self.feat_fusion_method_lid, + "loss_lid_type": self.loss_lid_type, + "loss_class_weight": self.loss_class_weight, + "loss_class_weight_exp": self.loss_class_weight_exp, "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, "lid_length": self.lid_length, @@ -422,8 +447,16 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def change_config(self, hf_feats, transducer, languageid): + # def change_config(self, hf_feats, transducer, languageid): + def change_config(self, loss_weight_transducer, loss_weight_lid, lid_length, hf_feats, transducer, languageid): logging.info("changing hf wav2transducer config") + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + self.hf_feats.change_config(**hf_feats) self.transducer.change_config(**transducer) self.languageid.change_config(**languageid) @@ -436,14 +469,24 @@ def add_class_args(parser, prefix=None, skip=set()): parser = ArgumentParser(prog="") parser.add_argument( - "--feat-fusion-start", + "--feat-fusion-start-transducer", + default=0, + type=int, + help=""" + the input to transducer model will fuse the wav2vec + layers from feat_fusion_start_transducer to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-start-lid", default=0, type=int, help=""" - the input to x-vector model will fuse the wav2vec - layers from feat_fusion_start to + the input to lid model will fuse the wav2vec + layers from feat_fusion_start_lid to the wav2vec num_layers""", ) + parser.add_argument( "--feat-fusion-method-transducer", default="weighted-avg", @@ -459,6 +502,24 @@ def add_class_args(parser, prefix=None, skip=set()): "in [weighted-avg, linear, cat, last]"), ) + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + choices=["CE", "weightedCE", "focal_loss"], + help=("loss type for language identification"), + ) + parser.add_argument( + "--loss-class-weight", + default=None, + type=str, + help=("class weight for language identification"), + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=("class weight exponent for language identification"), + ) parser.add_argument( "--loss-weight-transducer", default=0.005, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index 4a8ca173..28d51679 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -38,9 +38,13 @@ def __init__( hf_feats: Union[Dict, HFWav2Vec2], transducer: Union[Dict, RNNTransducer], languageid: 
Union[Dict, ResNet1dLanguageID], - feat_fusion_start: int = 0, + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, feat_fusion_method_transducer: str = "weighted-avg", feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp: float = 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -67,8 +71,17 @@ def __init__( # languageid = wav2languageid.languageid - super().__init__(hf_feats, transducer, languageid, feat_fusion_start, - feat_fusion_method_transducer, feat_fusion_method_lid, loss_weight_transducer, loss_weight_lid, lid_length) + super().__init__(hf_feats, transducer, languageid, + feat_fusion_start_transducer=feat_fusion_start_transducer, + feat_fusion_start_lid=feat_fusion_start_lid, + feat_fusion_method_transducer=feat_fusion_method_transducer, + feat_fusion_method_lid=feat_fusion_method_lid, + loss_lid_type=loss_lid_type, + loss_class_weight=loss_class_weight, + loss_class_weight_exp=loss_class_weight_exp, + loss_weight_transducer=loss_weight_transducer, + loss_weight_lid=loss_weight_lid, + lid_length=lid_length) @staticmethod def filter_args(**kwargs): @@ -121,6 +134,22 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + type=str, + help=""" + The type of the loss for language id + """, + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=""" + The exponent of the class weight for language id + """, + ) parser.add_argument( "--loss-weight-transducer", From 47fae72f38076d7278ba83ef1651f2c446e7d2af Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Jul 2023 22:00:52 +0000 Subject: [PATCH 68/89] merge commit --- egs/commonvoice/v1/cmd.sh | 7 +++++-- ...ec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 5 +++-- ...ec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml | 7 ++++--- ...ec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml | 6 +++--- ...ec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml | 2 +- ...2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml | 2 +- ...wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml | 12 ++++++------ ...vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml | 6 +++--- ...sducer_ecapadnn512x3_1layer_stage2_v2.1.yaml | 6 ++++-- ...sducer_ecapadnn512x3_1layer_stage2_v2.2.yaml | 4 ++-- ...2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml | 1 + ...lsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml | 2 +- .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 1 + .../v1/global_conf/config_lid_v4.1_13langs.sh | 2 +- .../v1/global_conf/config_lid_v4.2_13langs.sh | 7 +++---- ...fig_pruned_filmed_transducer_v1.0_13langs.sh | 2 +- ...fig_pruned_filmed_transducer_v5.1_13langs.sh | 2 +- .../config_pruned_transducer_v4.0_13langs.sh | 4 ++-- .../v1/local/initailize_lid_model.py | 4 +++- egs/commonvoice/v1/run_011_train_asr.sh | 1 + egs/commonvoice/v1/run_012_train_lid.sh | 17 ++++++++++------- egs/commonvoice/v1/run_015_train_film_asr.sh | 12 ++++++------ egs/commonvoice/v1/run_020_train_asr_lid.sh | 12 ++++++------ .../v1/run_025_train_film_asr_lid.sh | 3 +-- egs/commonvoice/v1/run_030_inference.sh | 2 +- egs/commonvoice/v1/run_031_inference_film.sh | 2 +- hyperion/bin/finetune_wav2vec2rnn_transducer.py | 5 ++++- hyperion/bin/train_wav2vec2xvector.py | 4 ++++ .../hf_wav2rnn_film_transducer.py | 3 +++ .../hf_wav2rnn_film_transducer_languageid.py | 14 ++++++++------ 
.../hf_wav2rnn_transducer_languageid.py | 2 ++ hyperion/torch/models/xvectors/xvector.py | 15 +++++++++++++++ .../torch/narchs/rnn_film_transducer_decoder.py | 12 ++++++++---- 33 files changed, 116 insertions(+), 70 deletions(-) diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh index 6606a180..cedd70f9 100755 --- a/egs/commonvoice/v1/cmd.sh +++ b/egs/commonvoice/v1/cmd.sh @@ -23,8 +23,11 @@ elif [ "$(hostname -d)" == "rockfish.cluster" ];then export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" export cuda_eval_cmd="$train_cmd" else - export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " - export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export train_cmd="run.pl" + export cuda_cmd="run.pl" export cuda_eval_cmd="$train_cmd" + #export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " + #export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + #export cuda_eval_cmd="$train_cmd" fi diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml index 7d3d133e..15e06f93 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -10,7 +10,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 50 + max_batch_length: 40 max_audio_length: 15. min_batch_size: 1 drop_last: false @@ -34,7 +34,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 50 + max_batch_length: 40 max_audio_length: 15. min_batch_size: 1 drop_last: true @@ -62,6 +62,7 @@ model: rnn_dropout_rate: 0.4 rnn_type: lstm joiner: + joiner_type: film_joiner hid_feats: 512 feat_fusion_method: film-weighted-avg feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml index 5a1555dd..b391f50c 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -22,7 +22,7 @@ data: num_chunks_per_seg_epoch: 0.3 data_loader: - num_workers: 1 + num_workers: 8 val: dataset: aug_cfgs: @@ -45,12 +45,13 @@ data: weight_exponent: 0.3 num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 1 + num_workers: 8 model: transducer: decoder: prune_range: 15 - override_dropouts: false + joiner: + joiner_type: film_joiner trainer: optim: opt_type: sgd diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml index 72a4c6a6..208a094c 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml @@ -19,10 +19,10 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.05 data_loader: - num_workers: 8 + num_workers: 4 val: dataset: aug_cfgs: @@ -45,7 +45,7 @@ data: weight_exponent: 0.3 num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 8 + num_workers: 4 model: hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml index 8947cfd0..6d7317f7 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml @@ -77,7 +77,7 @@ trainer: lrsch_type: exp_lr decay_rate: 0.8 decay_steps: 45000 - hold_steps: 90000 + hold_steps: 40000 min_lr: 4e-5 warmup_steps: 3000 update_lr_on_opt_step: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml index aaf5dedb..7a5b5dd1 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.05 data_loader: num_workers: 8 diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml index e5ae33a4..2833099f 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -9,8 +9,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -18,7 +18,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.3 + num_chunks_per_seg_epoch: 0.05 data_loader: num_workers: 1 @@ -32,8 +32,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -71,4 +71,4 @@ trainer: # eff_batch_size: 1024 eff_batch_size: 128 train_mode: full - \ No newline at end of file + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml index d270d62c..221698d0 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml @@ -9,14 +9,14 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 32 + min_batch_size: 24 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.2 data_loader: num_workers: 8 val: @@ -29,7 +29,7 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 32 + min_batch_size: 24 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml index 716a9d8f..4d6b8bed 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -59,8 +59,10 @@ model: # loss_class_weight_exp: 1.0 # 0~1 loss_weight_transducer: 1.0 - loss_weight_lid: 10.0 - loss_weight_embed: 10 + loss_weight_lid: 20.0 + loss_weight_embed: 20 + loss_reg_weight_transducer: 0.0 + loss_reg_weight_lid: 10.0 # lid_length: 3.0 # feat_fusion_method_transducer: film-fused-feature diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml index 2f625da0..4197c653 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml @@ -61,8 +61,8 @@ model: loss_weight_transducer: 1.0 loss_weight_lid: 0.0 loss_weight_embed: 10 - loss_reg_weight_transducer: 0.5 - loss_reg_weight_lid: 0.0 + loss_reg_weight_transducer: 0.0 + loss_reg_weight_lid: 1.0 # lid_length: 3.0 # feat_fusion_method_transducer: film-fused-feature diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml index 27132c2d..a647c80b 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -28,6 +28,7 @@ languageid: multilayer_concat: true endpoint_channels: 3072 hid_act: swish + dropout_rate: 0.1 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml index 63c914e3..803dc396 100644 --- 
a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml @@ -32,7 +32,7 @@ languageid: pool_type: ch-wise-att-mean+stddev inner_feats: 128 embed_dim: 128 - loss_type: subcenter-arc-softmax + loss_type: arc-softmax num_subcenters: 2 cos_scale: 32.0 margin: 0. diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml index 5ca98bd9..86d1e7c0 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -27,6 +27,7 @@ languageid: multilayer: true multilayer_concat: true endpoint_channels: 1536 + dropout_rate: 0.1 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh index 7d0ed120..9b398388 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh @@ -29,7 +29,7 @@ nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0014.pth - + nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.1.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v4.1_13_langs.s2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh index 49721635..1989a904 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh @@ -23,15 +23,14 @@ nnet_type=hf_wav2vec2resnet1d nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml nnet_s1_args="" - nnet_name=${hf_model_name}_resnet1d_v4.2_13_langs nnet_s1_name=$nnet_name.s1 - nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0003.pth + nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.2.yaml nnet_s2_args="" -nnet_s2_name=${hf_model_name}_resnet1d_v2_13_langs.s2 +nnet_s2_name=${hf_model_name}_resnet1d_v4.2_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh index b0ed4451..aca7859c 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -40,7 +40,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0003.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh index ab3d1ec8..951be9e0 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio 
dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh index f43b323f..43bc7282 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio de_test_proc_audio fr_test_proc_audio en_test_proc_audio kab_test_proc_audio it_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -34,7 +34,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0014.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py index 22e32bed..1862333c 100644 --- a/egs/commonvoice/v1/local/initailize_lid_model.py +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -17,7 +17,9 @@ def copy_model_parameters(ASR_model, LID_model): ASR_state_dict = ASR_model["model_state_dict"] LID_state_dict = LID_model["model_state_dict"] - #ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} + # LID_state_dict = {name.replace("module.", ""): param for name, param in LID_state_dict.items()} + + # ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} # remove feature fuser diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index 55cb04a3..b6a50e7f 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -75,6 +75,7 @@ if [ $stage -le 1 ]; then --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1236 \ --num-gpus $ngpu fi diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh index 
3b250e16..bf14500e 100755 --- a/egs/commonvoice/v1/run_012_train_lid.sh +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -7,8 +7,10 @@ . ./path.sh set -e +# export CUDA_VISIBLE_DEVICES=3 + stage=1 -ngpu=2 +ngpu=4 config_file=default_config.sh interactive=false num_workers="" @@ -48,19 +50,20 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1234 \ --num-gpus $ngpu fi @@ -77,13 +80,13 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.text-file $val_dir/text \ @@ -108,13 +111,13 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $val_dir/langs \ diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh index 638384bb..e86cf62d 100755 --- a/egs/commonvoice/v1/run_015_train_film_asr.sh +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -60,13 +60,13 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ 
--data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -91,13 +91,13 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -123,13 +123,13 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh index 6a4b3252..a2422eb5 100755 --- a/egs/commonvoice/v1/run_020_train_asr_lid.sh +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -61,13 +61,13 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_transducer_languageid.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -92,13 +92,13 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer_languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ 
+ --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -125,11 +125,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh index f5976ee1..9058ee5a 100755 --- a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -104,8 +104,7 @@ if [ $stage -le 2 ]; then --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ - --in-model-transducer $nnet_transducer \ - --in-model-lid $nnet_lid \ + --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ --num-gpus $ngpu diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index ec5b140b..9c5eaaa4 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -40,7 +40,7 @@ transducer_dir=exp/transducer/$nnet_name # Extracts x-vectors for evaluation for name in $test_data do - nj=40 + nj=20 steps_transducer/decode_wav2vec2rnn_transducer.sh \ --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh index 7b796107..d8af0e1b 100755 --- a/egs/commonvoice/v1/run_031_inference_film.sh +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -40,7 +40,7 @@ transducer_dir=exp/transducer/$nnet_name # Extracts x-vectors for evaluation for name in $test_data do - nj=40 + nj=16 steps_transducer/decode_wav2vec2rnn_film_transducer.sh \ --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ diff --git a/hyperion/bin/finetune_wav2vec2rnn_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_transducer.py index 4092ecd7..64d352e0 100755 --- a/hyperion/bin/finetune_wav2vec2rnn_transducer.py +++ b/hyperion/bin/finetune_wav2vec2rnn_transducer.py @@ -18,7 +18,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer) from 
hyperion.torch.trainers import TransducerTrainer as Trainer @@ -27,6 +26,10 @@ namespace_to_dict) from torch.nn.utils.rnn import pad_sequence +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 8e1653b1..1be2b456 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -25,6 +25,10 @@ from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, namespace_to_dict) +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 84f2239c..77579c94 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -246,6 +246,9 @@ def unfreeze_film(self): if "film" in name: logging.info(f"unfreezing {name}") param.requires_grad = True + if "lang_embedding" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True def freeze_feat_fuser(self): if self.feat_fuser is None: diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 7daeddcb..7ee44b01 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -339,13 +339,13 @@ def forward( if i in return_feat_layers ] - loss_reg_lid = 0 - if self.loss_reg_weight_lid > 0: - loss_reg_lid = self.languageid.get_regularization_loss() + # loss_reg_lid = 0 + # if self.loss_reg_weight_lid > 0: + loss_reg_lid = self.languageid.get_regularization_loss() - loss_reg_transducer = 0 - if self.loss_reg_weight_transducer > 0: - loss_reg_transducer = self.transducer.get_regularization_loss() + # loss_reg_transducer = 0 + # if self.loss_reg_weight_transducer > 0: + loss_reg_transducer = self.transducer.get_regularization_loss() @@ -353,6 +353,8 @@ def forward( loss_transducer=trans_output.loss, loss_lid=loss_lid, loss_embed=loss_embed, + loss_reg_lid=loss_reg_lid, + loss_reg_transducer=loss_reg_transducer, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, h_feats=trans_output.h_feats, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index fe6cee1d..6b608368 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -23,6 +23,8 @@ class RNNTransducerLanguageIDOutput(HypDataClass): loss_transducer: torch.Tensor # Loss from the transducer loss_lid: torch.Tensor # Loss from the language ID loss_embed: Optional[torch.Tensor] = None # Loss from the embedding + loss_reg_lid: Optional[torch.Tensor] = None # Regularization 
loss from the language ID + loss_reg_transducer: Optional[torch.Tensor] = None # Regularization loss from the transducer loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index d67785d2..04895971 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -558,6 +558,21 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + + def get_regularization_loss(self): + reg_loss = 0.0 + total_params = 0 + + for param in self.parameters(): + reg_loss += torch.norm(param)**2 + total_params += torch.numel(param) + + reg_loss = (reg_loss) / total_params + + return reg_loss + + + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 9d030ae7..25f0c5f6 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -309,10 +309,14 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang_embedding: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: # embed lang - if self.film_cond_type == ["one-hot"]: + # logging.info(f"lang_embedding.shape: {lang_embedding.shape}") + # import pdb; pdb.set_trace() + if self.film_cond_type == "one-hot": lang_embedding = self.lang_embedding(lang_embedding) - elif self.film_cond_type == ["lid_pred"]: + elif self.film_cond_type == "lid_pred": lang_embedding = self.lid_lang_embedding(lang_embedding) + # logging.info(f"lang_embedding.shape: {lang_embedding.shape}") + # logging.info(f"film_cond_type: {self.film_cond_type}") # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -348,9 +352,9 @@ def decode(self, # if self.film_cond_type in ["one-hot", "lid_pred"]: # lang_embedding = self.lang_embedding(lang) - if self.film_cond_type == ["one-hot"]: + if self.film_cond_type == "one-hot": lang_embedding = self.lang_embedding(lang) - elif self.film_cond_type == ["lid_pred"]: + elif self.film_cond_type == "lid_pred": lang_embedding = self.lid_lang_embedding(lang) if method == "time_sync_beam_search": return self.decode_time_sync_beam_search(x, From 562498f69dca3cfab24a8ee452a1e86c58ee85c0 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Wed, 5 Jul 2023 18:02:26 -0400 Subject: [PATCH 69/89] update decode code --- .../hf_wav2rnn_film_transducer_languageid.py | 27 ++++++++++++++----- .../hf_wav2rnn_transducer_languageid.py | 2 +- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 7daeddcb..9697d32c 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -381,29 +381,42 @@ def infer(self, feats_languageid, hid_feats, feat_lengths = 
self.forward_lid_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, None) + # logging.info(f"feat_lengths: {feat_lengths}") + # logging.info(f"feats_languageid.shape: {feats_languageid.shape}") + # logging.info(f"feats_languageid: {feats_languageid}") - lid = self.languageid( - feats_languageid.float(), - feat_lengths, + output = self.languageid( + feats_languageid, + None, None, return_enc_layers=None, - return_classif_layers=None, + return_classif_layers=[0], return_logits=True, ) + + # output = self.languageid( + # feats_languageid, + # feat_lengths, + # None, + # return_enc_layers=None, + # return_classif_layers=[0], + # return_logits=True, + # ) - feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid) # (N, T, C) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"][0]) # (N, T, C) text = self.transducer.infer(feats_transducer, feat_lengths, + lang=output["h_classif"][0], decoding_method=decoding_method, beam_width=beam_width, max_sym_per_frame=max_sym_per_frame, max_sym_per_utt=max_sym_per_utt) - return text, lid + return text, output["logits"] def unfreeze_lid_film(self): for name, param in self.named_parameters(): diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index 952cbb65..278b09ad 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -321,7 +321,7 @@ def infer(self, # logging.info(f"feats_languageid: {feats_languageid}") lid = self.languageid( feats_languageid.float(), - feat_lengths, + None, None, return_enc_layers=None, return_classif_layers=None, From 458e65ed9918fe082aabf0f9fb59a29d394addb6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 7 Jul 2023 03:14:36 +0000 Subject: [PATCH 70/89] add rnn_original for film-rnn --- .../narchs/rnn_film_transducer_decoder.py | 58 ++++++++++++++----- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 94fe4b17..17bbe515 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -22,7 +22,8 @@ from ...utils.text import add_sos from ..layer_blocks import TransducerFiLMJoiner as FiLMJoiner from ..layer_blocks import TransducerJoiner as Joiner -from ..layer_blocks import TransducerRNNFiLMPredictor as RNNPredictor +from ..layer_blocks import TransducerRNNFiLMPredictor as FiLMRNNPredictor +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor from .net_arch import NetArch @@ -125,9 +126,13 @@ def _make_predictor(self): self.predictor_args["condition_size"] = self.condition_size # Add FiLM args to the predictor args if pred_type == "rnn": + pred_args = filter_func_args(FiLMRNNPredictor.__init__, + self.predictor_args) + self.predictor = FiLMRNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) + elif pred_type == "rnn_original": pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) - self.predictor = RNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) + self.predictor = RNNPredictor(**pred_args) # elif pred_type == "conv": # pred_args = filter_func_args(ConvPredictor.__init__, # self.predictor_args) @@ -326,7 +331,10 @@ def forward( 
sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) sos_y_padded = sos_y_padded.to(torch.int64) # apply predictor and joiner - pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, _ = self.predictor(sos_y_padded) loss_simple = loss_pruned = None if self.rnnt_loss == "k2_pruned": loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( @@ -399,7 +407,10 @@ def decode_greedy(self, sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, (h, c) = self.predictor(sos) T = x.size(1) t = 0 hyp = [] @@ -422,8 +433,11 @@ def decode_greedy(self, if y != blank_id: hyp.append(y.item()) y = y.reshape(1, 1) - pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) - + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, (h, c) = self.predictor(y, (h, c)) + sym_per_utt += 1 sym_per_frame += 1 @@ -445,7 +459,10 @@ def decode_time_sync_beam_search(self, device = x.device sos = torch.tensor([blank_id], device=device).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, state = self.predictor(sos) T = x.size(1) t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] @@ -472,11 +489,20 @@ def decode_time_sync_beam_search(self, pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( + # pred_out, pred_state = self.predictor( + # pred_in, + # lang_embedding, + # y_star.pred_state, + # ) + if self.predictor_args["pred_type"] == "rnn": + pred_out, pred_state = self.predictor( pred_in, lang_embedding, y_star.pred_state, - ) + ) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) + cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -572,7 +598,11 @@ def decode_align_length_sync_beam_search( device = x.device sos = torch.tensor([blank_id], device=device).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, state = self.predictor(sos) + T = x.size(1) #t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] @@ -602,12 +632,14 @@ def decode_align_length_sync_beam_search( if cached_key not in cache: pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - - pred_out, pred_state = self.predictor( + if self.predictor_args["pred_type"] == "rnn": + pred_out, pred_state = self.predictor( pred_in, lang_embedding, y_star.pred_state, - ) + ) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] 
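Note on the patch above: it threads a `pred_type` check through every predictor call so the decoder can run either the FiLM-conditioned predictor ("rnn") or the unconditioned one ("rnn_original"). A minimal sketch of how that dispatch could be factored into a single helper, assuming the predictors keep the signatures used in the hunks above (the FiLM predictor takes `(y, lang_embedding, states)`, the plain one `(y, states)`, with `states` defaulting to None); the helper name `_run_predictor` is hypothetical and not part of the patch:

    def _run_predictor(self, y, lang_embedding, states=None):
        # Dispatch on the configured predictor type (mirrors the if/elif
        # branches added throughout forward()/decode_*() in this patch).
        if self.predictor_args["pred_type"] == "rnn":
            # FiLM-conditioned RNN predictor: consumes the language embedding.
            return self.predictor(y, lang_embedding, states)
        elif self.predictor_args["pred_type"] == "rnn_original":
            # Plain RNN predictor: ignores the language conditioning.
            return self.predictor(y, states)
        raise ValueError(f"unknown pred_type: {self.predictor_args['pred_type']}")

Each call site would then reduce to `pred_out, state = self._run_predictor(y, lang_embedding, state)`.
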
From c1d193abd8161a35017d316382b6025ef2c22db0 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 19 Jul 2023 16:36:06 -0400 Subject: [PATCH 71/89] finished experiments of models 2.0 in voxceleb/v2 --- egs/voxceleb/v1.2/run_001_prepare_data.sh | 18 +---- egs/voxceleb/v2/README.md | 60 +++++++++++++++ ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...baseplus_ecapatdnn512x3_stage1_v2.0_0.yaml | 59 +++++++++++++++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 2 +- ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 2 +- ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++++++++++++++ ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++++++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++++++++++++++++++ ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 45 ++++++++++++ .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 44 +++++++++++ .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 45 ++++++++++++ .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 44 +++++++++++ ...wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...ig_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ .../config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ hyp_utils/create_data_split_dirs.sh | 4 +- hyp_utils/create_data_split_links.sh | 6 +- hyp_utils/feats/make_evad.sh | 2 +- hyperion/bin/compute_energy_vad.py | 37 ++++++++-- hyperion/io/ark_data_writer.py | 20 +++-- hyperion/io/audio_reader.py | 12 ++- hyperion/io/data_rw_factory.py | 8 +- hyperion/io/data_writer.py | 36 ++++++++- hyperion/io/h5_data_writer.py | 19 +++-- hyperion/io/rw_specifiers.py | 47 ++++++++---- hyperion/utils/__init__.py | 1 + 34 files changed, 1281 insertions(+), 67 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh create mode 100644 
egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index c151e270..aef70e96 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -23,34 +23,24 @@ fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - # hyp_utils/conda_env.sh prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ --use-kaldi-ids \ --output-dir data/voxceleb1_test - #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ --vox1-corpus-dir $voxceleb1_root \ --output-dir data/voxsrc22_dev - # local/prepare_voxsrc22_dev.py \ - # --vox1-corpus-dir $voxceleb1_root \ - # --voxsrc22-corpus-dir $voxsrc22_root \ - # --output-dir data/voxsrc22_dev - prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ - --vox1-corpus-dir $voxceleb1_root \ - --output-dir data/voxsrc22_test fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# local/prepare_voxsrc22_test.py \ -# --corpus-dir $voxsrc22_root \ -# --output-dir data/voxsrc22_test +# prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ +# --vox1-corpus-dir $voxceleb1_root \ +# --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then - # # split vox2 into 2 parts, for cohort and qmf training + # split vox2 into 2 parts, for cohort and qmf training split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train - #local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index c64a4b41..a005b6e8 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -78,6 +78,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | | | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | | | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 | +| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 | +| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 | +| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 | +| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 0.080 | +| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 | +| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 
Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 | +| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 | +| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 | ### VoxCeleb 1 Entire-Clean trial list @@ -86,6 +101,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | | | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | | | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 | +| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.82 | 0.050 | 0.085 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 | +| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 | +| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 | +| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 | +| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 | +| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 | ### VoxCeleb 1 Hard-Clean trial list @@ -94,6 +124,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | | | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | | | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 | +| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 | +| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 | +| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 | +| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 | +| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 | +| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 | +| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 | +| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 | +| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 | +| | | | Cosine + QMF | 
1.39 | 0.079 | 0.123 | ### VoxSRC2022 dev @@ -102,3 +147,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | | | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | | | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 0.242 | 0.144 | 0.231 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml new file mode 100644 index 00000000..ebeedde6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: 
wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4850 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-4 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index eed0ad1f..69a8322b 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -58,6 +58,6 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 30 + epochs: 8 eff_batch_size: 512 train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index d66d6877..3443591a 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -68,6 +68,6 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 8 + epochs: 4 eff_batch_size: 256 train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + 
dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: 
wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 
1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ 
+hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name 
+nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 
+plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh index 06c30779..b8aad6c8 100755 --- a/hyp_utils/create_data_split_dirs.sh +++ b/hyp_utils/create_data_split_dirs.sh @@ -6,7 +6,7 @@ storage_name=$(date +'%m_%d_%H_%M') -echo "$0 $@" # Print the command line for logging + if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; @@ -15,6 +15,7 @@ if [ $# -ne 3 ]; then echo "Usage: $0 " echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0" fi + output_dir=$1 storage_dir=$2 nodes=$3 @@ -22,6 +23,7 @@ nodes=$3 link_dir=$output_dir/storage if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $linkdir ]; then + echo "$0 $@" # Print the command line for logging echo "Prepare to distribute data over multiple $nodes nodes" dir_name=$storage_dir/$storage_name/storage if [ "$nodes" == "b0" ];then diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh index fb5b8ca0..8416742e 100755 --- a/hyp_utils/create_data_split_links.sh +++ b/hyp_utils/create_data_split_links.sh @@ -6,11 +6,11 @@ storage_name=$(date +'%m_%d_%H_%M') -echo "$0 $@" # Print the command line for logging -if [ $# -ne 3 ]; then - echo "Usage: $0 < " +if [ $# -ne 2 ]; then + echo "Usage: $0 " echo "$0 exp/vad_dir/vad.JOB.ark 40" fi +echo "$0 $@" # Print the command line for logging output_file_pattern=$1 nj=$2 diff --git a/hyp_utils/feats/make_evad.sh b/hyp_utils/feats/make_evad.sh index 373fc4a6..16ddbf74 100755 --- a/hyp_utils/feats/make_evad.sh +++ b/hyp_utils/feats/make_evad.sh @@ -87,7 +87,7 @@ fi $cmd JOB=1:$nj $logdir/make_vad_${name}.JOB.log \ hyp_utils/conda_env.sh \ compute_energy_vad.py --cfg $vad_config $opt_args \ - --input $scp --output ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ + --recordings-file $scp --output-spec ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ --part-idx JOB --num-parts $nj || exit 1 # concatenate the .scp files together. diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index e9773fff..9d50388c 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -13,19 +13,31 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -def compute_vad(input_path, output_path, write_num_frames, **kwargs): +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) input_args = AR.filter_args(**kwargs) - reader = AR(input_path, **input_args) + reader = AR(recordings_file, **input_args) - writer = DWF.create(output_path) + metadata_columns = [ + "frame_shift", + "frame_length", + "num_frames", + "num_speech_frames", + "prob_speech", + ] + + writer = DWF.create(output_spec, metadata_columns=metadata_columns) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") @@ -39,6 +51,7 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): rtf = vad.frame_shift * y.shape[0] / dt num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 + logging.info( "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f", key, @@ -48,7 +61,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): dt, rtf, ) - writer.write([key], [y]) + metadata = { + "frame_shift": vad.frame_shift, + "frame_length": vad.frame_length, + "num_frames": y.shape[0], + "num_speech_frames": num_speech_frames, + "prob_speech": prob_speech, + } + writer.write([key], [y], metadata) if write_num_frames is not None: f_num_frames.write("%s %d\n" % (key, y.shape[0])) @@ -63,9 +83,10 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument("--write-num-frames", default=None) + parser.add_argument("--write-stats", default=None) AR.add_class_args(parser) EnergyVAD.add_class_args(parser) diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 6adf78b2..26f77112 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict import numpy as np - +import pandas as pd from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -46,7 +46,10 @@ def __init__( self.f = open(archive_path, "w") if script_path is not None and not self.script_is_scp: - row = self.script_sep.join(["id", "storage_path", "storage_byte"]) + columns = ["id", "storage_path", "storage_byte"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): @@ -97,6 +100,7 @@ def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -107,9 +111,7 @@ def write( it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -125,7 +127,11 @@ def write( if self.script_is_scp: self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") else: - row = self.script_sep.join([key_i, self.archive_path, str(pos)]) + columns = [key_i, str(self.archive_path), str(pos)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") if self._flush: diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 1052ce8c..6c152cc5 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -346,7 +346,9 @@ def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = key = segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path = self.recordings.iloc[self.cur_item] + segment = self.recordings.iloc[self.cur_item] + key = segment["id"] + file_path = segment["storage_path"] x_i, fs_i = self.read_wavspecifier( file_path, self.wav_scale, offset_i, dur_i ) @@ -397,7 +399,8 @@ def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args @@ -423,7 +426,7 @@ def read( Args: keys: List of recording/segment_ids names. time_offset: float or float list with time-offsets - time_durs: float or float list with durations + time_durs: float or float list with durations Returns: data: List of waveforms @@ -527,7 +530,8 @@ def add_class_args(parser, prefix: Optional[str] = None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index b56e8c27..092f5549 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -30,7 +30,10 @@ class DataWriterFactory(object): @staticmethod def create( - wspecifier: PathLike, compress: bool = False, compression_method: str = "auto" + wspecifier: PathLike, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -47,6 +50,7 @@ def create( flush=wspecifier.flush, compress=compress, compression_method=compression_method, + metadata_columns=metadata_columns, ) else: return ADW( @@ -56,6 +60,7 @@ def create( flush=wspecifier.flush, compress=compress, compression_method=compression_method, + metadata_columns=metadata_columns, ) @staticmethod @@ -76,7 +81,6 @@ def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='data writer options') class SequentialDataReaderFactory(object): diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index 8adbf87a..ff35ef2a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,10 @@ import os from abc import ABCMeta, abstractmethod -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict from pathlib import Path import 
numpy as np +import pandas as pd from ..utils import PathLike @@ -34,12 +35,14 @@ def __init__( flush: bool = False, compress: bool = False, compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): self.archive_path = Path(archive_path) self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method + self.metadata_columns = metadata_columns archive_dir = self.archive_path.parent archive_dir.mkdir(exist_ok=True, parents=True) @@ -56,9 +59,7 @@ def __init__( self.f_script = open(self.script_path, "w") else: self.script_sep = "," if script_ext == ".csv" else "\t" - self.f_script = open(self.script_path, "w", "utf-8") - row = self.script_sep.join(["id", "storage_path"]) - self.f_script.write(f"{row}\n") + self.f_script = open(self.script_path, "w", encoding="utf-8") def __enter__(self): """Function required when entering contructions of type @@ -87,11 +88,37 @@ def flush(self): """Flushes the file""" pass + def standardize_write_args( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): + if isinstance(keys, str): + keys = [keys] + data = [data] + + if metadata is not None: + if isinstance(metadata, pd.DataFrame): + metadata = metadata.to_dict() + + metadata_list = [] + for c in self.metadata_columns: + m_c = metadata[c] + if not isinstance(m_c, (list, np.ndarray)): + m_c = [m_c] + metadata_list.append(m_c) + + metadata = metadata_list + + return keys, data, metadata + @abstractmethod def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -101,5 +128,6 @@ def write( If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. + metadata: dictionary/DataFrame with metadata """ pass diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index c34aa0ca..4d05f963 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,10 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import is_token @@ -37,7 +38,10 @@ def __init__( self.f = h5py.File(archive_path, "w") if script_path is not None and not self.script_is_scp: - row = self.script_sep.join(["id", "storage_path"]) + columns = ["id", "storage_path"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): @@ -89,6 +93,7 @@ def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -99,9 +104,7 @@ def write( it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -115,7 +118,11 @@ def write( if self.script_is_scp: self.f_script.write(f"{key_i} {self.archive_path}\n") else: - row = self.script_sep.join([key_i, self.archive_path]) + columns = [key_i, str(self.archive_path)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") if self._flush: diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 37f579b4..93123247 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -7,6 +7,8 @@ import re from enum import Enum +from pathlib import Path +import pandas as pd class ArchiveType(Enum): @@ -174,6 +176,11 @@ def create(cls, wspecifier): archive_type = ArchiveType.AUDIO archive = archives[cur_archive] cur_archive += 1 + elif option == "csv": + assert script is None, "Repeated csv in wspecifier %s" % script + assert len(archives) > cur_archive + script = archives[cur_archive] + cur_archive += 1 elif option == "scp": assert script is None, "Repeated scp in wspecifier %s" % script assert len(archives) > cur_archive @@ -332,7 +339,7 @@ def create(cls, rspecifier): assert len(archives) == 1 spec_type = None - archive = archives[0] + archive = Path(archives[0]) archive_type = None once = False is_sorted = False @@ -361,6 +368,9 @@ def create(cls, rspecifier): assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.RTTM + elif option == "csv": + assert spec_type is None + spec_type = RSpecType.SCRIPT elif option == "scp": assert spec_type is None spec_type = RSpecType.SCRIPT @@ -374,24 +384,31 @@ def create(cls, rspecifier): assert spec_type is not None, "Wrong wspecifier options %s" % fields[0] if spec_type == RSpecType.SCRIPT: - with open(archive, "r") as f: - scp_f2 = f.readline().strip().split(" ")[1] - if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + if archive.suffix == ".csv": + df = pd.read_csv(archive, nrows=2) + storage_path = df["storage_path"].values[0] + if re.match(r".*\.h5$", scp_f2) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark:.*$", scp_f2) is not None: + elif re.match(r".*\.ark$", scp_f2) is not None: archive_type = ArchiveType.ARK - elif ( - re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None - ): + elif re.match(r".*[cvg]$", scp_f2) is not None: archive_type = ArchiveType.AUDIO else: - archive_type = ArchiveType.ARK - - # .split('[')[0].split(':') - # if len(scp) == 1: - # archive_type = ArchiveType.H5 - # else: - # archive_type = ArchiveType.ARK + raise ValueError(f"Unknown format for {storage_path}") + else: + with open(archive, "r") as f: + scp_f2 = f.readline().strip().split(" ")[1] + if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + archive_type = ArchiveType.H5 + elif re.match(r".*\.ark:.*$", scp_f2) is not None: + archive_type = ArchiveType.ARK + elif ( + re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) + is not None + ): + archive_type = ArchiveType.AUDIO + else: + archive_type = ArchiveType.ARK if archive_type == ArchiveType.ARK: for option in options: diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 51b476aa..e8ad5056 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,6 +3,7 @@ Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) """ +from .info_table import InfoTable from .class_info import ClassInfo from .dataset import Dataset from .enrollment_map import EnrollmentMap From 26eca97bdab59182bc00f29a5a55294988f46d04 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 7 Aug 2023 18:10:21 +0000 Subject: [PATCH 72/89] add configs for commonvoice speaker verification --- ..._speaker_ecapatdnn512x3_stage1_v1.3.1.yaml | 70 +++++++++++++++++++ ...0m_speaker_ecapatdnn512x3_stage1_v1.3.yaml | 70 +++++++++++++++++++ ...0m_speaker_ecapatdnn512x3_stage2_v1.3.yaml | 70 +++++++++++++++++++ ...2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml | 44 ++++++++++++ .../global_conf/config_spk_v1.3.1_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_spk_v1.3_13langs.sh | 42 +++++++++++ hyperion/bin/finetune_wav2vec2xvector.py | 5 ++ 7 files changed, 343 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml new file mode 100644 index 00000000..b03a0282 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml new file mode 100644 index 00000000..523bf6fd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 28000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml new file mode 100644 index 00000000..39b94671 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 0.3 + class_name: language + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1.0 + class_name: language + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 4 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 28000 + hold_steps: 20000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 512 + train_mode: full \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml new file mode 100644 index 00000000..1abfea29 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + 
- 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh new file mode 100644 index 00000000..d820ac2d --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio_overlap_spk +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v1.3.1_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/speaker_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v1.3.1_13_langs.s2 +nnet_s2_dir=exp/speaker_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage3_v1.3.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/speaker_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh new file mode 100644 index 00000000..2e583f03 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio_overlap_spk +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + 
+nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v1.3_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/speaker_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v1.3_13_langs.s2 +nnet_s2_dir=exp/speaker_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage3_v1.3.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/speaker_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index fc3c7084..3bc2fae4 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -26,6 +26,10 @@ from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, namespace_to_dict) +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, @@ -126,6 +130,7 @@ def train_model(gpu_id, args): device=device, metrics=metrics, ddp=world_size > 1, + # loss_weight=train_loader.batch_sampler.class_info["weights"], **trn_args, ) trainer.load_last_checkpoint() From 89efce43a3c25b1fc3284afb84823af803d92add Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 4 Sep 2023 18:59:26 -0400 Subject: [PATCH 73/89] voxceleb v1.2 works up to snorm backend --- egs/sre19-av-v/v0.1/steps_be/face_be_utils.py | 9 +- .../v1/steps_be/eval-tel-be-snorm-v2.py | 2 +- egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py | 2 +- .../v1/steps_be/train-tel-be-knn-v1.py | 2 +- .../v1/steps_be/train-tel-be-knn-v3.py | 2 +- .../v1/steps_be/train-tel-be-knn-v4.py | 2 +- .../adv.v2/steps_backend/eval-be-cos-Nvs1.py | 2 +- .../adv.v2/steps_backend/eval-be-cos.py | 2 +- egs/voxceleb/v1.1/conf/vad_16k.yaml | 1 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 84 +-- ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 16 +- ...train_ecapatdnn512x3_xvec_stage1_v3.0.yaml | 89 +-- ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 30 +- egs/voxceleb/v1.2/conf/vad_16k.yaml | 3 +- egs/voxceleb/v1.2/run_002_compute_evad.sh | 66 +++ .../v1.2/run_003_prepare_noises_rirs.sh | 102 ++++ .../v1.2/run_004_prepare_xvec_train_data.sh | 76 +++ egs/voxceleb/v1.2/run_005_train_xvector.sh | 78 +++ egs/voxceleb/v1.2/run_006_extract_xvectors.sh | 103 ++++ egs/voxceleb/v1/steps_be/eval_be_cos.py | 2 +- egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py | 2 +- egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py | 2 +- hyp_utils/create_audios_split_links.sh | 27 + hyp_utils/create_data_split_links.sh | 2 - .../xvectors/extract_wav2vec2xvectors.sh | 8 +- .../xvectors/extract_xvectors_from_wav.sh | 10 +- .../make_babble_noise_for_nnet_train.sh | 22 +- .../xvectors/pack_rirs_for_nnet_train.sh | 9 - .../preprocess_audios_for_nnet_train.sh | 8 +- hyperion/bin/eval_cosine_scoring_backend.py | 200 +++++++ .../eval_cosine_scoring_backend_with_qmf.py | 472 +++++++++++++++ hyperion/bin/eval_verification_metrics.py | 96 +++ hyperion/bin/eval_xvec_logits_from_wav.py | 20 +- hyperion/bin/extract_wav2vec2xvectors.py | 41 +- hyperion/bin/extract_wav2xvectors.py | 333 +++++++++++ 
hyperion/bin/extract_xvectors_from_feats.py | 20 +- hyperion/bin/extract_xvectors_from_wav.py | 26 +- .../extract_xvectors_slidwin_from_feats.py | 10 +- .../bin/extract_xvectors_slidwin_from_wav.py | 10 +- hyperion/bin/finetune_wav2xvector.py | 228 ++++++++ .../generate_adv_attacks_xvector_classif.py | 8 +- hyperion/bin/hyperion_dataset.py | 406 ++++++++++++- hyperion/bin/hyperion_tables.py | 33 +- hyperion/bin/make_babble_noise_audio_files.py | 102 ++-- hyperion/bin/make_wav2xvector.py | 91 +++ hyperion/bin/merge_scores.py | 99 ++++ hyperion/bin/pack_wav_rirs.py | 17 +- hyperion/bin/plot_embedding_tsne_per_class.py | 11 +- hyperion/bin/prepare_data.py | 9 +- hyperion/bin/preprocess_audio_files.py | 163 +++--- hyperion/bin/train_wav2vec2xvector.py | 19 +- hyperion/bin/train_wav2xvector.py | 196 +++++++ hyperion/data_prep/__init__.py | 2 + hyperion/data_prep/data_prep.py | 3 +- hyperion/data_prep/musan.py | 107 ++++ hyperion/data_prep/rirs.py | 103 ++++ hyperion/data_prep/voxceleb1.py | 18 +- hyperion/data_prep/voxceleb2.py | 23 +- hyperion/data_prep/voxsrc22.py | 49 +- hyperion/helpers/trial_data_reader.py | 2 +- hyperion/helpers/vector_class_reader.py | 2 +- hyperion/io/ark_data_reader.py | 6 +- hyperion/io/audio_reader.py | 18 +- hyperion/io/audio_writer.py | 71 ++- hyperion/io/hyp_data_reader.py | 5 +- hyperion/io/packed_audio_reader.py | 6 +- hyperion/io/rw_specifiers.py | 6 +- hyperion/np/augment/noise_augment.py | 26 +- hyperion/np/augment/reverb_augment.py | 15 +- hyperion/np/augment/speech_augment.py | 2 +- hyperion/np/augment/speed_augment.py | 22 +- .../classifiers/binary_logistic_regression.py | 2 +- hyperion/np/classifiers/greedy_fusion.py | 4 +- hyperion/np/classifiers/linear_gbe.py | 8 +- hyperion/np/classifiers/linear_gbe_up.py | 9 +- hyperion/np/classifiers/linear_svmc.py | 8 +- .../np/classifiers/logistic_regression.py | 6 +- hyperion/np/classifiers/q_scoring_homo_gbe.py | 2 +- hyperion/np/classifiers/svmc.py | 4 +- hyperion/np/feats/energy_vad.py | 40 +- hyperion/np/feats/mfcc.py | 57 +- hyperion/np/metrics/__init__.py | 7 +- hyperion/np/metrics/cllr.py | 2 +- hyperion/np/metrics/utils.py | 2 +- hyperion/np/metrics/verification_evaluator.py | 78 ++- hyperion/np/pdfs/core/normal.py | 21 +- hyperion/np/pdfs/core/normal_diag_cov.py | 11 +- hyperion/np/pdfs/hmm/hmm.py | 4 +- hyperion/np/pdfs/jfa/jfa_total.py | 9 +- .../np/pdfs/mixtures/exp_family_mixture.py | 2 +- hyperion/np/pdfs/mixtures/gmm.py | 24 +- hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 13 +- .../np/pdfs/mixtures/gmm_tied_diag_cov.py | 13 +- hyperion/np/pdfs/plda/frplda.py | 4 +- hyperion/np/pdfs/plda/plda.py | 4 +- hyperion/np/pdfs/plda/splda.py | 4 +- hyperion/np/transforms/skl_tsne.py | 4 +- hyperion/torch/data/audio_dataset.py | 12 + hyperion/torch/layers/audio_feats_factory.py | 2 +- hyperion/torch/models/__init__.py | 18 +- hyperion/torch/models/plda/splda.py | 2 +- .../models/wav2xvectors/hf_wav2xvector.py | 4 +- .../wav2xvectors/wav2resnet1d_xvector.py | 18 + .../models/wav2xvectors/wav2resnet_xvector.py | 18 + .../torch/models/wav2xvectors/wav2xvector.py | 113 +++- hyperion/torch/narchs/audio_feats_mvn.py | 4 + hyperion/torch/torch_model.py | 19 +- hyperion/utils/class_info.py | 16 + hyperion/utils/dataset.py | 552 +++++++++++++----- hyperion/utils/fold_list.py | 2 +- hyperion/utils/info_table.py | 72 ++- hyperion/utils/{math.py => math_funcs.py} | 22 +- hyperion/utils/plotting.py | 3 +- hyperion/utils/scp_list.py | 2 +- hyperion/utils/segment_set.py | 42 +- hyperion/utils/sparse_trial_key.py | 18 +- 
hyperion/utils/sparse_trial_scores.py | 124 +++- hyperion/utils/train_val_eval_list.py | 2 +- hyperion/utils/trial_key.py | 16 +- hyperion/utils/trial_ndx.py | 84 ++- hyperion/utils/trial_scores.py | 86 ++- hyperion/utils/utt2info.py | 2 +- 122 files changed, 4509 insertions(+), 945 deletions(-) create mode 100755 egs/voxceleb/v1.2/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v1.2/run_005_train_xvector.sh create mode 100755 egs/voxceleb/v1.2/run_006_extract_xvectors.sh create mode 100755 hyp_utils/create_audios_split_links.sh create mode 100755 hyperion/bin/eval_cosine_scoring_backend.py create mode 100755 hyperion/bin/eval_cosine_scoring_backend_with_qmf.py create mode 100755 hyperion/bin/eval_verification_metrics.py create mode 100755 hyperion/bin/extract_wav2xvectors.py create mode 100755 hyperion/bin/finetune_wav2xvector.py mode change 100644 => 100755 hyperion/bin/hyperion_dataset.py create mode 100755 hyperion/bin/make_wav2xvector.py create mode 100755 hyperion/bin/merge_scores.py create mode 100755 hyperion/bin/train_wav2xvector.py create mode 100644 hyperion/data_prep/musan.py create mode 100644 hyperion/data_prep/rirs.py rename hyperion/utils/{math.py => math_funcs.py} (93%) diff --git a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 14e3fc20..b6252df7 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -2,15 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import logging import numpy as np from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import softmax +from hyperion.utils.math_funcs import softmax from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import LNorm from hyperion.np.clustering import AHC @@ -23,9 +19,6 @@ def lnorm(x): def cosine_scr(x1, x2): - # t = LNorm() - # x1 = t.predict(x1) - # x2 = t.predict(x2) x1 = lnorm(x1) x2 = lnorm(x2) return np.dot(x1, x2.T) diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 907509fd..c9657a66 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm as SNorm diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index b661cbde..24ef731b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR 
from hyperion.helpers import PLDAFactory as F diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index 8e7715e0..bdef3fc3 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 12f1725b..51795676 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 234f966c..79c1cd6f 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 85e82149..48094d0f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index d5cd6a55..49720cb5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1.1/conf/vad_16k.yaml b/egs/voxceleb/v1.1/conf/vad_16k.yaml index 5fb0111c..a8d7b4d4 100644 --- a/egs/voxceleb/v1.1/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.1/conf/vad_16k.yaml @@ -6,3 +6,4 @@ vad_energy_threshold: 5.5 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 32767 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml index 1633f4a2..2cf31713 100644 --- 
a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -29,48 +29,50 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 2048 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - - 1 - resb_channels: - - 2048 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - - 5 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 4096 - norm_before: false + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 dropout_rate: 0.2 - hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.2 - norm_before: false + norm_before: false trainer: optim: opt_type: adam diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 877736b3..21f0db8b 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -37,15 +37,15 @@ data: num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0.25 + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml index f15d453d..03a7f736 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -29,47 +29,48 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 512 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - resb_channels: - - 512 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - resb_strides: - - 1 - res2net_width_factor: 1 - 
res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 1536 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 norm_before: false - dropout_rate: 0.002 hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.0 - norm_before: false - hid_act: swish trainer: optim: opt_type: adam @@ -91,3 +92,5 @@ trainer: log_interval: 1000 epochs: 40 eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml index 45e55d97..9788bb7c 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,31 +21,31 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0. + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. 
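+      # with override_dropouts true, encoder dropout is switched off (0.) for
+      # this second-stage large-margin fine-tuning config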
trainer: optim: opt_type: sgd @@ -67,3 +67,5 @@ trainer: swa_start: 31 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/vad_16k.yaml b/egs/voxceleb/v1.2/conf/vad_16k.yaml index 5fb0111c..e5a6bb82 100644 --- a/egs/voxceleb/v1.2/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.2/conf/vad_16k.yaml @@ -2,7 +2,8 @@ sample_frequency: 16000 frame_shift: 10 frame_length: 25 snip_edges: false -vad_energy_threshold: 5.5 +vad_energy_threshold: -4.89 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh new file mode 100755 index 00000000..e7593df2 --- /dev/null +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + compute_energy_vad.py --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion_tables.py cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion_dataset.py add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..aed1dae4 --- /dev/null +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + prepare_data.py musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion_tables.py cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion_dataset.py set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + make_babble_noise_audio_files.py \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion_dataset.py make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + prepare_data.py rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + pack_wav_rirs.py ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion_dataset.py add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh new file mode 100755 index 00000000..7649ff22 --- /dev/null +++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +nodes=b1 +nj=40 +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion_tables.py cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion_dataset.py set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion_dataset.py remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion_dataset.py remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion_dataset.py split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh new file mode 100755 index 00000000..d2f31ea1 --- /dev/null +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2xvector.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2xvector.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh new file mode 100755 index 00000000..09b8c8e9 --- /dev/null +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion_tables.py cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? 
$num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion_tables.py cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.py b/egs/voxceleb/v1/steps_be/eval_be_cos.py index 1f9978ee..a9bc03d1 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos.py @@ -20,7 +20,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py index 7034126a..bf66d72b 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores, Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.utils.list_utils import ismember from hyperion.helpers import TrialDataReader as TDR diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py index dad89ced..0eca769d 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py @@ -20,7 +20,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/hyp_utils/create_audios_split_links.sh b/hyp_utils/create_audios_split_links.sh new file mode 100755 index 00000000..7125a2c4 --- /dev/null +++ b/hyp_utils/create_audios_split_links.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright +# 2023 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# Creates links to distrubute data into multiple nodes in clsp grid + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo "$0 exp/xvector_audios/voxceleb data/voxceleb/recordings.csv flac" +fi +echo "$0 $@" # Print the command line for logging +output_dir=$1 +rec_file=$2 +file_format=$3 + +if [[ $(hostname -f) != *.clsp.jhu.edu ]]; then + exit 0 +fi + +for f in $(awk -F "," '$1!="id" { print $1}' $rec_file); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. 
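+  # (when storage/ does exist, the command turns $output_dir/$f.$file_format into
+  #  a symlink pointing into one of the storage/ sub-directories, so the processed
+  #  audio gets spread over the CLSP nodes; elsewhere it is a no-op)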
+ hyp_utils/create_data_link.pl $output_dir/$f.$file_format +done + + + diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh index 8416742e..c7cfa3eb 100755 --- a/hyp_utils/create_data_split_links.sh +++ b/hyp_utils/create_data_split_links.sh @@ -4,8 +4,6 @@ # Apache 2.0. # Creates links to distrubute data into multiple nodes in clsp grid -storage_name=$(date +'%m_%d_%H_%M') - if [ $# -ne 2 ]; then echo "Usage: $0 " echo "$0 exp/vad_dir/vad.JOB.ark 40" diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh index 6c6f0fdf..d8ae2e55 100755 --- a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh +++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh @@ -87,9 +87,9 @@ if [ $stage -le 0 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -109,9 +109,9 @@ if [ $stage -le 1 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index 0b5227cc..b763a25c 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -87,10 +87,10 @@ if [ $stage -le 0 ];then hyp_utils/conda_env.sh --num-gpus $num_gpus \ extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ - --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --part-idx JOB --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -110,9 +110,9 @@ if [ $stage -le 1 ];then extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh index 27c77454..4530ad3b 100755 --- a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh +++ b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh @@ -8,9 +8,7 @@ nj=1 cmd="run.pl" stage=0 file_format=flac -nodes=b1 storage_name=$(date +'%m_%d_%H_%M') -#proc_opts="--remove-dc-offset" min_spks=3 max_spks=10 num_reuses=5 @@ -23,10 +21,8 @@ if [ $# != 3 ]; then echo "Usage: $0 " echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" echo "Options: " - #echo " --nj # number 
of parallel jobs" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --file-format # Output file_format supported by soundfile (flac,ogg,wav,...)" - #echo " --proc-opts # Extra arguments for proc-audio-files.py" echo " --min-spks # max number of spks per utterance" echo " --max-spks # max number of spks per utterance" echo " --num-reuses # number of times a signal is reused to create babble" @@ -51,22 +47,12 @@ output_dir=$(utils/make_absolute.sh $dir) args="" $cmd $dir/log/make_babble_noise_${name}.log \ hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py ${args} \ - --output-audio-format $file_format $args $proc_opts \ + make_babble_noise_audio_files.py \ + --audio-format $file_format $args $proc_opts \ --min-spks $min_spks --max-spks $max_spks --num-reuses $num_reuses \ --write-time-durs $data_out/utt2dur \ - --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $data_out/wav.scp - - - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 + --output-recordings-file $data_out/wav.scp echo "$0: Succeeded making babble noise for $name" diff --git a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh index c6634135..437cd208 100755 --- a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh +++ b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh @@ -66,13 +66,4 @@ $cmd $dir/log/pack_rirs_${name}.log \ pack_wav_rirs.py ${args} --input $data_in/wav.scp \ --output ${file_format},scp:$output_dir/rirs_${name}.${file_format},$data_out/rirs.scp || exit 1; - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 - echo "$0: Succeeded packing RIRs for $name" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..aed40672 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -92,12 +92,14 @@ fi $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + preprocess_audio_files.py ${args} --audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ - --input $data_in/wav.scp \ + # --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $output_dir/wav.${name}.JOB.scp + --output-recordings-file $output_dir/wav.${name}.JOB.scp + #--output-script $output_dir/wav.${name}.JOB.scp for n in $(seq $nj); do cat $output_dir/wav.${name}.$n.scp || exit 1; diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py new file mode 100755 index 00000000..1a740024 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) 
+import time +import logging +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialNdx, TrialKey, TrialScores, EnrollmentMap, SegmentSet +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, +): + + logging.info("loading data") + enroll_map, ndx, x_e, x_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if num_enroll_parts > 1 or num_test_parts > 1: + score_file = Path(score_file) + new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + logging.info("saving scores to %s", score_file) + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py new file mode 100755 index 00000000..f567dd81 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + TrialNdx, + TrialKey, + TrialScores, + EnrollmentMap, + SegmentSet, + InfoTable, +) +from hyperion.utils.math_funcs import cosine_scoring, average_vectors +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm + + +def get_precomp_qm_names(quality_measures): + # snorm qm will be calculated later + return [q for q in quality_measures if q not in ["snorm-mu", "snorm-mu/s"]] + + +def normalize_duration(q, min_dur, max_dur, frame_rate): + + q = q / frame_rate + q = np.log(np.clip(q / frame_rate, a_min=min_dur, a_max=max_dur)) + log_min_dur = np.log(min_dur) + log_max_dur = np.log(max_dur) + q = (q - log_min_dur) / 
(log_max_dur - log_min_dur) + return q + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + + # quality measures may be in segments file or/and feature_set file + # so we combine both if both are given + if segments_file is not None: + test_segments = SegmentSet.load(segments_file) + if enroll_segments_file is not None and segments_file != enroll_segments_file: + enroll_segments = SegmentSet.load(enroll_segments_file) + else: + enroll_segments = test_segments + + test_feats_set = test_feats_reader.feature_set + enroll_feats_set = enroll_feats_reader.feature_set + if segments_file: + test_segments.add_columns(test_feats_set) + if enroll_feats_set != test_feats_set or enroll_segments != test_segments: + enroll_segments.add_columns(enroll_feats_set) + + # now we retrive the quality measures + q_e = [] + q_t = [] + # snorm qm will be calculated later + retrieve_qm = get_precomp_qm_names(quality_measures) + q_e = enroll_segments.loc[enroll_map["segmentid"], retrieve_qm] + q_t = test_segments.loc[ndx.seg_set, retrieve_qm] + + # normalize durations + if "speech_duration" in retrieve_qm: + q_e["speech_duration"] = normalize_duration( + q_e["speech_duration"], min_dur, max_dur, 1 + ) + q_t["speech_duration"] = normalize_duration( + q_t["speech_duration"], min_dur, max_dur, 1 + ) + + if "num_speech_frames" in retrieve_qm: + q_e["num_speech_frames"] = normalize_duration( + q_e["num_speech_frames"], min_dur, max_dur, frame_rate + ) + q_t["num_speech_frames"] = normalize_duration( + q_t["num_speech_frames"], min_dur, max_dur, frame_rate + ) + + # q_e = np.asarray(q_e) + # q_t = np.asarray(q_t) + + return enroll_map, ndx, x_e, x_t, q_e, q_t + + +def load_cohort_data(segments_file, feats_file): + + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + + # segments.add_columns(feats_reader.feature_set) + + # retrieve_qm = get_precomp_qm_names(quality_measures) + # q = np.asarray(segments[retrieve_qm]) + return segments, x # , q + + +def average_qm(q, model_set, ids): + q_avg = average_vectors(q.values, ids) + q_avg = pd.DataFrame(q, columns=q.columns) + q_avg["id"] = model_set + q_avg.set_index("id", drop=False, inplace=True) + return q_avg + + +def get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + + score_file = Path(score_file) + new_suffix = "" + if score_name is not None: + new_suffix = f".{score_name}" + + if num_enroll_parts > 1 or num_test_parts > 1: + new_suffix = ( + f"{new_suffix}.{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + ) + + if 
new_suffix: + new_suffix = f"{new_suffix}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + return score_file + +def save_scores(ndx, scores, score_file, score_name, enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts): + +def save_empty_scores(ndx, score_file, score_name, enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts): + scores = np.zeros(ndx.trial_mask.shape, dtype="float32") + score_file = get_score_filepath(score_file, score_name,enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts) + + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + + + +def segment_to_trial_qm(q_e, q_t): + q_trial = {} + for q_name in ["speech_duration", "num_speech_frames"]: + if q_name in q_e: + q_trial_name = f"max_{q_name}" + q_trial[q_trial_name] = np.maximum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + q_trial_name = f"min_{q_name}" + q_trial[q_trial_name] = np.minimum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + + return q_trial + + +def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + if scores_norm is not None: + scores_norm = scores_norm[sort_idx] + for qm in q_trial: + q_trial[qm] = q_trial[qm][sort_idx] + + return scores, scores_norm, q_trial + + +def make_qm_table(ndx, scores, scores_norm, q_trial): + if scores_norm is None: + scores = scores[ndx.trial_mask] + else: + scores = scores_norm[ndx.trial_mask] + + for qm in q_trial: + q_trial[qm] = q_trial[qm][ndx.trial_mask] + + I, J = np.nonzero(ndx.trial_mask) + modelid = ndx.model_set[I] + segmentid = ndx.seg_set[J] + unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] + + q_dict = { + "id": unique_id, + "modelid": modelid, + "segmentid": segmentid, + "scores": scores, + } + q_dict.update(q_trial) + df = pd.DataFrame(q_dict) + return InfoTable(df) + + + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + preproc_file, + qmf_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + + logging.info("loading data") + enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if not np.any(ndx.trial_mask): + # this part doesn't have any trials, save empty files + + + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + q_e = average_qm(q_e, enroll_set, enroll_ids) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + q_trial = segment_to_trial_qm(q_e, q_t) + scores_norm = None + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores_norm, mu_z, s_z, mu_t, s_t = snorm( + scores, scores_coh_test, scores_enr_coh, return_stats=True + ) + if "snorm-mu" in quality_measures: + q_trial["max_snorm-mu"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu"] = np.minimum(mu_z, mu_t) + if "snorm-mu/s" in quality_measures: + mu_z = mu_z / s_z + mu_t = mu_t / s_t + q_trial["max_snorm-mu/s"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu/s"] = np.minimum(mu_z, mu_t) + + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + scores, scores_norm, q_trial = align_scores_to_ndx( + enroll_set, ndx, scores, scores_norm, q_trial + ) + if qmf_file is None: + qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + qm_file = get_score_filepath( + score_file, + "qm", + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + qm_table.save(qm_file) + return + + score_file_nonorm = get_score_filepath( + score_file, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores to %s", score_file_nonorm) + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file_nonorm) + + if scores_norm is not None: + score_file_snorm = get_score_filepath( + score_file, + "snorm", + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores with AS-Norm to %s", score_file_snorm) + scores.scores = scores_norm + scores.save(score_file_snorm) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Eval cosine-scoring with optional AS-Norm and QMF" + ) + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--enroll-segments-file", default=None) + parser.add_argument("--segments-file", default=None) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--qmf-file", default=None) + parser.add_argument( + "--quality-measures", + default=["snorm-mu/s", "speech_duration"], + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + parser.add_argument( + "--min-dur", default=0.1, type=float, help="lower bound to clip durations" + ) + parser.add_argument( + "--max-dur", default=30.0, type=float, help="upper bound to clip durations" + ) + parser.add_argument( + "--frame-rate", + default=100, + type=float, + help="frames/sec when durationa are expressed in frames", + ) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + 
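+    # --cohort-nbest is the adaptive S-Norm cohort size: AdaptSNorm keeps only the
+    # N best-scoring cohort embeddings per trial side (see the snorm call above)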
parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py new file mode 100755 index 00000000..83227558 --- /dev/null +++ b/hyperion/bin/eval_verification_metrics.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + +from jsonargparse import ( + ActionConfigFile, + ActionYesNo, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def eval_verification_metrics( + key_files, + score_files, + key_names, + score_names, + p_tar, + c_miss, + c_fa, + sparse, + output_file, +): + + assert len(key_files) == len(key_names) + assert len(score_files) == len(score_names) + dfs = [] + for score_file, score_name in zip(score_files, score_names): + for key_file, key_name in zip(key_files, key_names): + logging.info("Evaluating %s - %s", score_name, key_name) + evaluator = VE( + key_file, + score_file, + p_tar, + c_miss, + c_fa, + key_name, + score_name, + sparse=sparse, + ) + df_ij = evaluator.compute_dcf_eer() + dfs.append(df_ij) + + df = pd.concat(dfs) + logging.info("saving results to %s", output_file) + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + sep = "\t" if output_file.suffix == ".tsv" else "," + df.to_csv(output_file, sep=sep, index=False, float_format="{:,.4f}".format) + + pd.options.display.float_format = "{:.4}".format + print(df.to_string(), flush=True) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Evaluate speaker verification metrics") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--key-files", required=True, nargs="+") + parser.add_argument("--score-files", required=True, nargs="+") + parser.add_argument("--key-names", required=True, nargs="+") + parser.add_argument("--score-names", required=True, nargs="+") + parser.add_argument( + "--p-tar", + default=[0.05, 0.01, 0.005, 0.001], + nargs="+", + type=float, + help="target priors", + ) + parser.add_argument( + "--c-miss", default=None, nargs="+", type=float, help="cost of miss" + ) + parser.add_argument( + "--c-fa", default=None, nargs="+", type=float, help="cost of false alarm" + ) + parser.add_argument("--sparse", default=False, action=ActionYesNo) + 
parser.add_argument("--output-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + eval_verification_metrics(**kwargs) diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 9efbd6dd..f60c7508 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -21,8 +21,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -76,13 +80,15 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -105,7 +111,7 @@ def eval_xvec( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 6f7d269e..5eba1b99 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -21,8 +21,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) resamplers = {} @@ -84,9 +88,11 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=fs * min_utt_length, high=fs * max_utt_length + 1) + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -98,7 +104,7 @@ def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_speech_dur, @@ -117,7 +123,7 @@ def extract_xvectors( **kwargs, ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model 
= load_model(model_path, device) @@ -138,15 +144,12 @@ def extract_xvectors( logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info(f"opening input stream: {input_spec} with args={ar_args}") - with AR(input_spec, **ar_args) as reader: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, - path_prefix=vad_path_prefix, - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -160,9 +163,7 @@ def extract_xvectors( t2 = time.time() if fs != model.sample_frequency: resampler = get_resampler(fs, model.sample_frequency) - print(f"x01 {x0.shape} {np.max(x0)}") x0 = resampler(x0) - print(f"x01 {x0.shape} {np.max(x0)}") logging.info("processing utt %s", key0) for aug_id in range(num_augs): @@ -260,7 +261,7 @@ def extract_xvectors( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) parser.add_argument( @@ -278,7 +279,7 @@ def extract_xvectors( parser.add_argument("--model-path", required=True) parser.add_argument( "--hf-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -288,7 +289,7 @@ def extract_xvectors( ) parser.add_argument( "--xvec-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -314,18 +315,18 @@ def extract_xvectors( ) parser.add_argument( "--min-utt-length", - type=int, + type=float, default=5, help=("minimum utterance length in secs when using random utt length"), ) parser.add_argument( "--max-utt-length", - type=int, + type=float, default=120, help=("maximum utterance length in secs when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py new file mode 100755 index 00000000..7b04fcc8 --- /dev/null +++ b/hyperion/bin/extract_wav2xvectors.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torchaudio.transforms as tat +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +resamplers = {} + + +def get_resampler(source_fs, target_fs): + if source_fs in resamplers: + return resamplers[source_fs] + + resampler = tat.Resample( + int(source_fs), + int(target_fs), + 
lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + resamplers[source_fs] = resampler_f + return resampler_f + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus=%d", num_gpus) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model %s", model_path) + model = TML.load(model_path) + logging.info(f"xvector-model={model}") + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_speech_dur, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs, +): + + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + metadata_columns = ["speech_duration"] + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: + + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + if fs != model.sample_frequency: + resampler = get_resampler(fs, model.sample_frequency) + x0 = resampler(x0) + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + metadata = {} + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = 
torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + metadata["speech_duration"] = ( + x.shape[1] / model.sample_frequency + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y], metadata=metadata) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Extracts x-vectors from waveform computing acoustic features on the fly""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=float, + default=0, + help=( + "max. 
chunk length used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=float, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=float, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 13ad4277..b02db70c 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -19,8 +19,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -50,13 +54,15 @@ def load_model(model_path, device): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -78,7 +84,7 @@ def extract_xvectors( ): logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 577bbae7..6a8130d3 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -21,8 +21,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -76,9 +80,9 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = 
rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -90,7 +94,7 @@ def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_num_frames_spec, @@ -108,7 +112,7 @@ def extract_xvectors( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -130,9 +134,9 @@ def extract_xvectors( with DWF.create(output_spec) as writer: logging.info( - "opening input stream: {} with args={}".format(input_spec, ar_args) + "opening input stream: {} with args={}".format(recordings_file, ar_args) ) - with AR(input_spec, **ar_args) as reader: + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) @@ -235,12 +239,12 @@ def extract_xvectors( parser = ArgumentParser( description=( - "Extracts x-vectors from waveform computing " "acoustic features on the fly" + "Extracts x-vectors from waveform computing acoustic features on the fly" ) ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None @@ -299,7 +303,7 @@ def extract_xvectors( help=("maximum utterance length when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index a54c4d64..bcec5133 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -20,8 +20,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -71,7 +75,7 @@ def extract_xvectors( ): logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 8939ba91..f1a64e1b 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -22,8 +22,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from 
hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -96,7 +100,7 @@ def extract_xvectors( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py new file mode 100755 index 00000000..b100b544 --- /dev/null +++ b/hyperion/bin/finetune_wav2xvector.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import torch +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + 
logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + 
parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 209915c5..4336b7b9 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -24,8 +24,12 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py old mode 100644 new mode 100755 index c5a3f6b9..2e3a35ec --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -8,30 +8,40 @@ from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import (ClassInfo, Dataset, EnrollmentMap, FeatureSet, - InfoTable, PathLike, RecordingSet, SegmentSet) -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - -subcommands = ["add_features"] -# table_dict = { -# "segments": SegmentSet, -# "recordings": RecordingSet, -# "features": FeatureSet, -# "classes": ClassInfo, -# "enrollments": EnrollmentMap, -# "generic": InfoTable, -# } +from hyperion.utils import ( + ClassInfo, + Dataset, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, + ActionYesNo, +) + +subcommand_list = [ + "add_features", + "set_recordings", + "make_from_recordings", + "remove_short_segments", + "rebuild_class_idx", + "remove_classes_few_segments", + "split_train_val", + "copy", + "add_cols_to_segments", +] def add_common_args(parser): parser.add_argument( - "-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int, + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, ) @@ -45,6 +55,11 @@ def make_add_features_parser(): "--features-name", required=True, help="""name of the feature""" ) parser.add_argument("--features-file", required=True, help="""feature set file""") + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) add_common_args(parser) return parser @@ -54,10 +69,353 @@ def add_features( dataset: PathLike, features_name: str, features_file: PathLike, + output_dataset: PathLike, ): + if output_dataset is None: + output_dataset = dataset + dataset = Dataset.load(dataset, lazy=True) dataset.add_features(features_name, features_file) - dataset.save(dataset) + dataset.save(output_dataset) + + +def make_set_recordings_parser(): + parser = 
ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + parser.add_argument( + "--remove-features", + default=None, + nargs="+", + help="""removes feature files from the dataset, + since they maybe obsolote after modifiying the recordings""", + ) + parser.add_argument( + "--update-seg-durs", + default=False, + action=ActionYesNo, + help="""updates the durations in the segment table""", + ) + + add_common_args(parser) + return parser + + +def set_recordings( + dataset: PathLike, + recordings_file: PathLike, + output_dataset: PathLike, + remove_features: List[str], + update_seg_durs: bool, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.set_recordings(recordings_file, update_seg_durs) + if remove_features is not None: + for features_name in remove_features: + dataset.remove_features(features_name) + + dataset.save(output_dataset) + + +def make_make_from_recordings_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + + add_common_args(parser) + return parser + + +def make_from_recordings( + dataset: PathLike, recordings_file: PathLike, +): + output_dataset = dataset + import pandas as pd + + rec_df = pd.read_csv(recordings_file) + seg_df = rec_df[["id"]] + segments = SegmentSet(seg_df) + dataset = Dataset(segments, recordings=recordings_file) + dataset.save(output_dataset) + + +def make_remove_short_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--min-length", + required=True, + type=float, + help="""minimum required length of the segment""", + ) + + parser.add_argument( + "--length-name", + default="duration", + help="""name of the column indicating the length of the segment""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_short_segments( + dataset: PathLike, min_length: float, length_name: str, output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_short_segments(min_length, length_name) + dataset.save(output_dataset) + + +def make_rebuild_class_idx_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def rebuild_class_idx( + dataset: PathLike, class_name: str, output_dataset: PathLike, +): + if output_dataset is None: + 
output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.rebuild_class_idx(class_name) + dataset.save(output_dataset) + + +def make_remove_classes_few_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_classes_few_segments(class_name, min_segs, rebuild_idx) + dataset.save(output_dataset) + + +def make_split_train_val_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""input dataset dir or .yaml file""" + ) + parser.add_argument( + "--val-prob", + default=0.05, + type=float, + help="""proportion of segments used for val""", + ) + parser.add_argument( + "--min-train-samples", + default=1, + type=int, + help="""min. number of training samples / class""", + ) + + parser.add_argument( + "--joint-classes", + default=None, + nargs="+", + help="""types of classes that need to have same classes in train and val""", + ) + parser.add_argument( + "--disjoint-classes", + default=None, + nargs="+", + help="""types of classes that need to have different classes in train and val""", + ) + parser.add_argument( + "--seed", default=11235813, type=int, help="""random seed""", + ) + + parser.add_argument( + "--train-dataset", required=True, help="""output train dataset dir""", + ) + parser.add_argument( + "--val-dataset", required=True, help="""output val dataset dir""", + ) + + add_common_args(parser) + return parser + + +def split_train_val( + dataset: PathLike, + val_prob: float, + joint_classes: List[str], + disjoint_classes: List[str], + min_train_samples: int, + seed: int, + train_dataset: PathLike, + val_dataset: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + train_ds, val_ds = dataset.split_train_val( + val_prob, joint_classes, disjoint_classes, min_train_samples, seed + ) + train_ds.save(train_dataset) + val_ds.save(val_dataset) + + num_total = len(dataset) + num_train = len(train_ds) + num_val = len(val_ds) + logging.info( + "train: %d (%.2f%%) segments, val: %d (%.2f%%) segments", + num_train, + num_train / num_total * 100, + num_val, + num_val / num_total * 100, + ) + + +def make_copy_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--output-dataset", + required=True, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def copy( + dataset: PathLike, output_dataset: PathLike, +): + dataset = 
Dataset.load(dataset, lazy=True) + dataset.save(output_dataset) + + +def make_add_cols_to_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--right-table", required=True, help="table where the new data is" + ) + parser.add_argument( + "--columns", + required=True, + nargs="+", + help="""columns to copy to segments table""", + ) + parser.add_argument( + "--on", default=["id"], nargs="+", help="""columns to match both tables rows""", + ) + parser.add_argument( + "--right-on", + default=None, + nargs="+", + help="""columns to match both tables rows""", + ) + + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def add_cols_to_segments( + dataset: PathLike, + right_table: PathLike, + column_names: List[str], + on: List[str], + right_on: List[str], + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.add_cols_to_segments(right_table, column_names, on, right_on) + dataset.save(output_dataset) if __name__ == "__main__": @@ -66,15 +424,15 @@ def add_features( parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for subcommand in subcommands: + for subcommand in subcommand_list: parser_func = f"make_{subcommand}_parser" subparser = globals()[parser_func]() - subcommands.add_subcommand(k, subparser) + subcommands.add_subcommand(subcommand, subparser) args = parser.parse_args() subcommand = args.subcommand kwargs = namespace_to_dict(args)[args.subcommand] config_logger(kwargs["verbose"]) del kwargs["verbose"] - + del kwargs["cfg"] globals()[subcommand](**kwargs) diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 5a5f0b4f..7f61b35a 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -8,12 +8,23 @@ from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import (ClassInfo, EnrollmentMap, FeatureSet, InfoTable, - PathLike, RecordingSet, SegmentSet) -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - -subcommands = ["cat"] +from hyperion.utils import ( + ClassInfo, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +subcommand_list = ["cat"] table_dict = { "segments": SegmentSet, "recordings": RecordingSet, @@ -73,11 +84,11 @@ def cat( table_type: str, input_files: Union[List[PathLike], None], output_file: PathLike, - num_table: int, + num_tables: int, base_idx: int = 1, ): - assert input_files is not None or num_jobs != 0 + assert input_files is not None or num_tables != 0 output_file = Path(output_file) if input_files is None: ext = output_file.suffix @@ -103,15 +114,15 @@ def cat( parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for subcommand in subcommands: + for subcommand in subcommand_list: parser_func = f"make_{subcommand}_parser" subparser = globals()[parser_func]() - subcommands.add_subcommand(k, subparser) + subcommands.add_subcommand(subcommand, subparser) args = parser.parse_args() subcommand = args.subcommand 
kwargs = namespace_to_dict(args)[args.subcommand] config_logger(kwargs["verbose"]) del kwargs["verbose"] - + del kwargs["cfg"] globals()[subcommand](**kwargs) diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 4a356037..68e5b22b 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -15,12 +15,15 @@ from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -def make_noise(xs): +def make_noise(xs, max_value): lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) @@ -28,73 +31,78 @@ def make_noise(xs): for i in range(len(xs)): xs[i] = np.tile(xs[i], int(num_tiles[i]))[:max_len] + xs[0] -= xs[0].mean() for i in range(1, len(xs)): xs[0] += xs[i] - xs[i].mean() + max_x = np.max(np.abs(xs[0])) + if max_x > max_value: + xs[0] *= max_value / max_x + return xs[0] def make_babble_noise_audio_files( - input_path, + recordings_file, output_path, - output_script, - write_time_durs_spec, + output_recordings_file, + write_time_durs, min_spks=3, max_spks=7, num_reuses=5, random_seed=112358, - **kwargs + **kwargs, ): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") - rng = np.random.RandomState(seed=random_seed) + rng = np.random.default_rng(seed=random_seed) - if write_time_durs_spec is not None: + if write_time_durs is not None: okeys = [] info = [] count = 0 t1 = time.time() - with AR(input_path, **input_args) as reader: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: keys = reader.keys - with Writer(output_path, output_script, **output_args) as writer: - - for iters in range(num_reuses): - keys = rng.permutation(keys) - - cur_spks = min_spks + for iters in range(num_reuses): + keys = rng.permutation(keys) + + cur_spks = min_spks + utt_list = [] + for utt_idx in range(len(keys)): + if len(utt_list) < cur_spks: + utt_list.append(keys[utt_idx]) + continue + + x, fs = reader.read(utt_list) + fs = fs[0] + y = make_noise(x, reader.wav_scale) + babble_id = "babble-%05d" % (count) + logging.info("writing file %s", babble_id) + writer.write([babble_id], [y], [fs]) + if write_time_durs is not None: + okeys.append(babble_id) + info.append(y.shape[0] / fs) + + count += 1 utt_list = [] - for utt_idx in range(len(keys)): - if len(utt_list) < cur_spks: - utt_list.append(keys[utt_idx]) - continue - - x, fs = reader.read(utt_list) - fs = fs[0] - y = make_noise(x) - babble_id = "babble-%05d" % (count) - logging.info("writing file % s" % (babble_id)) - writer.write([babble_id], [y], [fs]) - if write_time_durs_spec is not None: - okeys.append(babble_id) - info.append(y.shape[0] / fs) - - count += 1 - utt_list = [] - cur_spks += 1 - if cur_spks > max_spks: - cur_spks = min_spks - - if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + cur_spks += 1 + if cur_spks > max_spks: + cur_spks = min_spks + + if write_time_durs is not 
None: + logging.info("writing time durations to %s", write_time_durs) u2td = Utt2Info.create(okeys, info) - u2td.save(write_time_durs_spec) + u2td.save(write_time_durs) - logging.info("finished making babble files, elapsed-time=%f" % (time.time() - t1)) + logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) if __name__ == "__main__": @@ -102,10 +110,10 @@ def make_babble_noise_audio_files( parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) - parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) + parser.add_argument("--output-recordings-file", required=True) + parser.add_argument("--write-time-durs", default=None) AR.add_class_args(parser) Writer.add_class_args(parser) diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py new file mode 100755 index 00000000..b5972d1b --- /dev/null +++ b/hyperion/bin/make_wav2xvector.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +""" + Copyright 2023 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from hyperion.hyp_defs import config_logger + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as W2RXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def init_feats(feats): + feat_args = AF.filter_args(**feats) + logging.info(f"feat args={feat_args}") + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + logging.info(f"feat-extractor={feat_extractor}") + return feat_extractor + + +def load_model(model_path): + logging.info("loading model %s", model_path) + model = TorchModel.auto_load(model_path) + logging.info(f"xvector-model={model}") + return model + + +def make_wav2xvector(feats, xvector_path, output_path): + + feats = init_feats(feats) + xvector_model = load_model(xvector_path) + if isinstance(xvector_model, RXVec): + model = W2RXVec(feats, xvector_model) + elif isinstance(xvector_model, R1dXVec): + model = W2R1dXVec(feats, xvector_model) + else: + TypeError( + "Conversion of xvector class=%s not available", xvector_model.__class__ + ) + + logging.info("saving model of class %s to %s", model.__class__, output_path) + model.save(output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Combines the feature extractor config with XVector model + to produce a Wav2XVector model with integrated feature extraction""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) 
+ AF.add_class_args(parser, prefix="feats") + parser.add_argument("--xvector-path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + del args.cfg + logging.debug(args) + + make_wav2xvector(**namespace_to_dict(args)) diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py new file mode 100755 index 00000000..6a275f5c --- /dev/null +++ b/hyperion/bin/merge_scores.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from hyperion.hyp_defs import config_logger + +from hyperion.utils import TrialScores +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): + + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + + ext = output_file.suffix + + if input_files is None: + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_enroll_parts): + idx_i = base_idx + i + for j in range(num_test_parts): + idx_j = base_idx + j + input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") + input_files.append(input_file_i) + + if ext == ".h5": + # if files are h5 we need to load everything in RAM + score_list = [] + for score_file in input_files: + scores = TrialScores.load(score_file) + score_list.append(scores) + + scores = TrialScores.merge(score_list) + scores.save(output_file) + else: + has_header = ext in [".csv", ".tsv"] + write_header = True + with open(output_file, "w", encoding="utf-8") as f_out: + for score_file in input_files: + with open(score_file) as f_in: + for i, line in enumerate(f_in): + if i == 0 and has_header and not write_header: + continue + f_out.write(line) + write_header = False + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts we divided the enrollment set""", + ) + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts we divided the test set""", + ) + + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + merge_scores(**kwargs) diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 78ac59c1..b2a1bc2b 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -13,8 +13,12 @@ from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import 
SequentialAudioReader as AR -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def pack_wav_rirs(input_path, output_spec, **kwargs): @@ -32,12 +36,15 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): h[h < 1e-3] = 0 h = np.trim_zeros(h) logging.info( - "Packing rir %s h_max=%f h_delay=%d h-length=%d" - % (key, h_max, h_delay, len(h)) + "Packing rir %s h_max=%f h_delay=%d h-length=%d", + key, + h_max, + h_delay, + len(h), ) writer.write([key], [h]) - logging.info("Packed RIRS elapsed-time=%.f" % (time.time() - t1)) + logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) if __name__ == "__main__": diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6af0202c..14da4d07 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -18,9 +18,14 @@ from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from hyperion.utils.math import cosine_scoring -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) +from hyperion.utils.math_funcs import cosine_scoring +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index e90ad0f7..f6723c7d 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -8,8 +8,12 @@ from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def make_parser(data_prep_class): @@ -33,6 +37,5 @@ def make_parser(data_prep_class): config_logger(1) data_prep_class = DataPrep.registry[args.subcommand] args = namespace_to_dict(args)[args.subcommand] - data_prep = data_prep_class(**args) data_prep.prepare() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index e8adfd16..bda9a503 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -15,13 +15,26 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from scipy import ndimage, signal +def resample_vad(vad, length): + step = (len(vad) - 1) / length + assert step < 1 + idx = step * np.arange(length, dtype=float) + idx = np.round(idx).astype(int) + return vad[idx] + + def process_vad(vad, length, fs, dilation, erosion): - vad = signal.resample(vad, length) > 0.5 + # vad = signal.resample(vad, length) > 0.5 + vad = resample_vad(vad, length) if dilation > 0: iters = int(dilation * fs) vad = ndimage.binary_dilation(vad, iterations=iters) @@ -34,9 +47,9 @@ def process_vad(vad, length, fs, dilation, erosion): def process_audio_files( - input_path, + recordings_file, output_path, - output_script, + output_recordings_file, 
write_time_durs_spec, vad_spec, vad_path_prefix, @@ -44,86 +57,92 @@ def process_audio_files( vad_dilation=0, vad_erosion=0, remove_dc_offset=False, - **kwargs + **kwargs, ): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") if write_time_durs_spec is not None: keys = [] info = [] - with AR(input_path, **input_args) as reader: - with Writer(output_path, output_script, **output_args) as writer: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - t1 = time.time() - for data in reader: - key, x, fs = data - logging.info("Processing audio %s" % (key)) - t2 = time.time() - - tot_samples = x.shape[0] - if vad_spec is not None: - num_vad_frames = int(round(tot_samples * vad_fs / fs)) - vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( - "bool", copy=False - ) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - x = x[vad] - - logging.info( - "utt %s detected %f/%f secs (%.2f %%) speech " - % ( - key[0], - x.shape[0] / fs, - tot_samples / fs, - x.shape[0] / tot_samples * 100, - ) - ) + t1 = time.time() + for data in reader: + key, x, fs = data + logging.info("Processing audio %s", key) + t2 = time.time() - if x.shape[0] > 0: - if remove_dc_offset: - x -= np.mean(x) - - writer.write([key], [x], [fs]) - if write_time_durs_spec is not None: - keys.append(key) - info.append(x.shape[0] / fs) - - xmax = np.max(x) - xmin = np.min(x) - else: - xmax = 0 - xmin = 0 - - t3 = time.time() - dt2 = (t2 - t1) * 1000 - dt3 = (t3 - t1) * 1000 - time_dur = len(x) / fs - rtf = (time_dur * 1000) / dt3 - logging.info( - ( - "Packed audio %s length=%0.3f secs " - "elapsed-time=%.2f ms. " - "read-time=%.2f ms. write-time=%.2f ms. " - "real-time-factor=%.2f" - "x-range=[%f-%f]" - ) - % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, xmin, xmax) + tot_samples = x.shape[0] + if vad_spec is not None: + num_vad_frames = int(round(tot_samples * vad_fs / fs)) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False ) - t1 = time.time() + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + x = x[vad] + + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech ", + key[0], + x.shape[0] / fs, + tot_samples / fs, + x.shape[0] / tot_samples * 100, + ) + + if x.shape[0] > 0: + if remove_dc_offset: + x -= np.mean(x) + + writer.write([key], [x], [fs]) + if write_time_durs_spec is not None: + keys.append(key) + info.append(x.shape[0] / fs) + + xmax = np.max(x) + xmin = np.min(x) + else: + xmax = 0 + xmin = 0 + + t3 = time.time() + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. 
" + "real-time-factor=%.2f " + "x-range=[%f - %f]" + ), + key, + time_dur, + dt3, + dt2, + dt3 - dt2, + rtf, + xmin, + xmax, + ) + t1 = time.time() if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + logging.info("writing time durations to %s", write_time_durs_spec) u2td = Utt2Info.create(keys, info) u2td.save(write_time_durs_spec) @@ -135,9 +154,9 @@ def process_audio_files( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) + parser.add_argument("--output-recordings-file", required=True) parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 8e1653b1..f132a35c 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -5,6 +5,7 @@ """ import logging import multiprocessing + # import sys import os import time @@ -17,13 +18,19 @@ from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -95,7 +102,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: - logging.info("trainer args={}".format(trn_args)) + logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py new file mode 100755 index 00000000..ddf292b8 --- /dev/null +++ b/hyperion/bin/train_wav2xvector.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp 
+from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + 
"data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Wav2XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index e978e219..9d885718 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -4,6 +4,8 @@ """ from .data_prep import DataPrep +from .musan import MusanDataPrep +from .rirs import RIRSDataPrep from .voxceleb2 import VoxCeleb2DataPrep from .voxceleb1 import VoxCeleb1DataPrep from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index d9828674..0f654676 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -67,7 +67,8 @@ def _get_recording_duration(recordings, i, n): def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList + + # from ..utils import SCPList #don't remember why I put this here futures = [] logging.info("submitting threats...") diff --git a/hyperion/data_prep/musan.py b/hyperion/data_prep/musan.py new file mode 100644 index 00000000..abf7a46c --- /dev/null +++ b/hyperion/data_prep/musan.py @@ -0,0 +1,107 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class MusanDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data noise, music, speech + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + self.subset = subset + + @staticmethod + def dataset_name(): + return "musan" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + choices=["noise", "music", "speech"], + help="""musan subset in [noise, music, speech]""", + required=True, + ) + + def prepare(self): + logging.info( + "Peparing Musan %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir / self.subset + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + { + "id": rec_ids, + "duration": recs.loc[rec_ids, "duration"].values, + "noise_type": self.subset, + } + ) + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset( + segments, + recordings=recs, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", + len(segments), + ) diff --git a/hyperion/data_prep/rirs.py b/hyperion/data_prep/rirs.py new file mode 100644 index 00000000..066819a8 --- /dev/null +++ b/hyperion/data_prep/rirs.py @@ -0,0 +1,103 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class RIRSDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + + @staticmethod + def dataset_name(): + return "rirs" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + + def prepare(self): + logging.info( + "Peparing RIRS corpus_dir:%s -> data_dir:%s", + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir + rirs_file = self.corpus_dir / "rir_list" + if rirs_file.exists(): + rirs_table = pd.read_csv( + rirs_file, + sep=" ", + header=None, + names=["dummy1", "rir_id", "dummy2", "room_id", "rec_files"], + ) + rec_files = [Path(f) for f in rirs_table["rec_files"].values] + room_ids = rirs_table["room_id"].values + else: + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + room_ids = None + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + {"id": rec_ids, "duration": recs.loc[rec_ids, "duration"].values,} + ) + if room_ids is not None: + segments["room_id"] = room_ids + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset(segments, recordings=recs,) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", len(segments), + ) diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index b3958605..025fad37 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -233,17 +233,19 @@ def prepare(self): Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) ] + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] @@ -256,15 +258,13 @@ def prepare(self): lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: @@ -331,7 +331,7 @@ def prepare(self): 
dataset = Dataset( segments, classes={"speaker": speakers, "language_est": languages}, - recordings={"recordings": recs}, + recordings=recs, enrollments=enrollments, trials=trials, sparse_trials=False, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 29ad3e44..969f2228 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -148,24 +148,27 @@ def prepare(self): df_lang = self._get_langs_est() rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) - rec_files = list(rec_dir.glob("**/*.m4a")) + rec_files1 = list(rec_dir.glob("**/*.m4a")) + rec_files = [Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True)] if not rec_files: # symlinks? try glob rec_files = [ - Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True) ] + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] @@ -178,15 +181,13 @@ def prepare(self): lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: @@ -252,7 +253,7 @@ def prepare(self): dataset = Dataset( segments, {"speaker": speakers, "language_est": languages}, - {"recordings": recs}, + recs, ) logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py index 79369149..f81f6eaf 100644 --- a/hyperion/data_prep/voxsrc22.py +++ b/hyperion/data_prep/voxsrc22.py @@ -127,6 +127,9 @@ def prepare_track12_dev(self): rec_ids = vox22_segmentid + vox1_segmentid rec_files = vox22_rec_files + vox1_rec_files + assert len(vox22_rec_files) > 0, "vox22 recording files not found" + assert len(vox1_rec_files) > 0, "vox1 recording files not found" + recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files}) recs = RecordingSet(recs) recs.sort() @@ -148,7 +151,7 @@ def prepare_track12_dev(self): logging.info("making dataset") dataset = Dataset( segments, - recordings={"recordings": recs}, + recordings=recs, enrollments=enrollments, trials=trials, sparse_trials=False, @@ -160,50 +163,6 @@ def prepare_track12_dev(self): len(segments), ) - # wav_file = voxsrc22_corpus_dir / file_id - # wav_file = vox1_corpus_dir / "wav" / file_id - # logging.info("searching audio files in %s", self.vox1_corpus_dir) - # vox1_rec_files = list(self.vox1_corpus_dir.glob("**/*.wav")) - # if not vox1_rec_files: - # # symlinks? 
try glob - # vox1_rec_files = [ - # Path(f) for f in glob.iglob(f"{self.vox1_corpus_dir}/**/*.wav", recursive=True) - # ] - - # vox1_rec_ids = [ f.parent.parent.name / f.parent.name / f.name for f in vox1_rec_files] - # rec_files = - - # rec_files = list(self.corpus_dir.glob("**/*.wav")) - # if not rec_files: - # # symlinks? try glob - # rec_files = [ - # Path(f) for f in glob.iglob(f"{self.corpus_dir}/**/*.wav", recursive=True) - # ] - - # u2s_file = output_dir / "utt2spk" - # logging.info("creating utt2spk file %s", u2s_file) - # file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) - # with open(u2s_file, "w") as f: - # for file_id in file_ids: - # f.write("%s %s\n" % (file_id, file_id)) - - # s2u_file = output_dir / "spk2utt" - # logging.info("creating spk2utt file %s", s2u_file) - # with open(s2u_file, "w") as f: - # for file_id in file_ids: - # f.write("%s %s\n" % (file_id, file_id)) - - # wav_file = output_dir / "wav.scp" - # logging.info("creating wav.scp file %s", wav_file) - # with open(wav_file, "w") as f: - # for file_id in file_ids: - # if "VoxSRC2022_dev" in file_id: - # wav_file = voxsrc22_corpus_dir / file_id - # else: - # wav_file = vox1_corpus_dir / "wav" / file_id - - # f.write("%s %s\n" % (file_id, wav_file)) - def prepare_track12_test(self): logging.info( "Preparing VoxSRC22 %s corpus:%s -> %s", diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 4f33770b..85904eb2 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -16,7 +16,7 @@ from ..utils.utt2info import Utt2Info -class TrialDataReader(object): +class TrialDataReader: """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. """ diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index c4c531ad..a9993768 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -49,7 +49,7 @@ def __init__( v[0]: int(v[1]) for v in [line.rstrip().split() for line in f] } - self.rng = np.random.RandomState(vcr_seed) + self.rng = np.random.default_rng(vcr_seed) self.csplit_max_spc = csplit_max_spc self.csplit_min_spc = csplit_min_spc self.csplit_mode = csplit_mode diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 6cf22d5f..eaf76d49 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -223,8 +223,8 @@ def read( self._eof = True break - row_offset_i = row_offset[i] if row_offset_is_list else row_offset - num_rows_i = num_rows[i] if num_rows_is_list else num_rows + row_offset_i = row_offset[count] if row_offset_is_list else row_offset + num_rows_i = num_rows[count] if num_rows_is_list else num_rows binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( @@ -269,7 +269,7 @@ def __init__( self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs ): super().__init__(file_path, permissive=False, **kwargs) - self.feature_set = FeatureSet.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 6c152cc5..a1adaef0 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -55,7 +55,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, 
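Several hunks in this patch replace np.random.RandomState with the Generator API returned by np.random.default_rng. The renamed methods involved are summarized in this short sketch (note the two APIs do not reproduce each other's streams for the same seed):

    import numpy as np

    legacy = np.random.RandomState(seed=1123)
    rng = np.random.default_rng(seed=1123)

    legacy.randint(low=0, high=10)   # Generator equivalent: rng.integers(low=0, high=10)
    legacy.random_sample()           # Generator equivalent: rng.random()
    legacy.randn(5)                  # Generator equivalent: rng.standard_normal(5)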
+ wav_scale: float = 1.0, ): if not isinstance(recordings, RecordingSet): recordings = RecordingSet.load(recordings) @@ -255,7 +255,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, + wav_scale: float = 1.0, part_idx: int = 1, num_parts: int = 1, ): @@ -373,7 +373,8 @@ def add_class_args(parser, prefix: Optional[str] = None): parser.add_argument( "--wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -399,8 +400,7 @@ def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args @@ -411,7 +411,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, + wav_scale: float = 1.0, ): super().__init__(recordings, segments, wav_scale) @@ -524,14 +524,14 @@ def add_class_args(parser, prefix: Optional[str] = None): parser.add_argument( "--wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index e416c209..ca0dde9f 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -27,12 +27,33 @@ "DOUBLE": "float64", "MS_ADPCM": "int16", "ULAW": "int16", - "PCM_U8": "uint8", - "PCM_S8": "int8", + "PCM_S8": "int16", "VORBIS": "float32", "GSM610": "int16", "G721_32": "int16", - "PCM_24": "int24", + "PCM_24": "int32", +} + +scale_32 = 2 ** 31 - 1 +scale_24 = 2 ** 23 - 1 +scale_16 = 2 ** 15 - 1 +scale_8 = 2 ** 7 - 1 + + +subtype_to_scale = { + "PCM_32": scale_32, + "ALAW": scale_16, + "IMA_ADPCM": scale_16, + "FLOAT": 1, + "PCM_16": scale_16, + "DOUBLE": 1, + "MS_ADPCM": scale_16, + "ULAW": scale_16, + "PCM_S8": scale_8, + "VORBIS": 1, + "GSM610": scale_16, + "G721_32": scale_16, + "PCM_24": scale_24, } @@ -45,6 +66,7 @@ class AudioWriter(object): audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) + wav_scale: scale of the input waveform """ def __init__( @@ -53,6 +75,7 @@ def __init__( script_path: Optional[PathLike] = None, audio_format: str = "wav", audio_subtype: Optional[str] = None, + wav_scale: float = 1.0, ): self.output_path = Path(output_path) self.script_path = Path(script_path) if script_path is not None else None @@ -63,9 +86,15 @@ def __init__( if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: - self.subtype = audio_subtype + self.subtype = audio_subtype.upper() assert sf.check_format(self.audio_format, self.subtype) + self._dtype = subtype_to_npdtype[self.subtype] + + self.wav_scale = wav_scale + # we multiply the audio for this number before saving it. 
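With wav_scale defaulting to 1.0, waveforms stay in [-1, 1] inside the pipeline, and AudioWriter rescales them by the subtype's integer range divided by wav_scale only when writing them out. A small numeric illustration for PCM_16 (illustrative, not patch code):

    import numpy as np

    wav_scale = 1.0              # scale of the in-memory waveform
    subtype_scale = 2 ** 15 - 1  # PCM_16 target range
    output_scale = subtype_scale / wav_scale

    x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
    x_int16 = (output_scale * x).astype(np.int16)
    # -> [-32767, -16383, 0, 16383, 32767]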
+ self._output_wav_scale = subtype_to_scale[self.subtype] / wav_scale + self.script_is_scp = False self.script_sep = None self.f_script = None @@ -78,7 +107,7 @@ def __init__( self.f_script = open(self.script_path, "w") else: self.script_sep = "," if script_ext == ".csv" else "\t" - self.f_script = open(self.script_path, "w", "utf-8") + self.f_script = open(self.script_path, "w", encoding="utf-8") row = self.script_sep.join( ["id", "storage_path", "duration", "sample_freq"] ) @@ -123,8 +152,7 @@ def write( data = [data] fs_is_list = isinstance(fs, (list, np.ndarray)) - assert self.subtype in subtype_to_npdtype - dtype = subtype_to_npdtype[self.subtype] + output_files = [] for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -135,7 +163,7 @@ def write( self.audio_format, ) fs_i = int(fs[i]) if fs_is_list else fs - data_i = data[i].astype(dtype, copy=False) + data_i = (self._output_wav_scale * data[i]).astype(self._dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) @@ -156,14 +184,11 @@ def write( @staticmethod def filter_args(**kwargs): valid_args = ( - "output_fs", - "output_wav_scale", - "output_audio_format", - "output_audio_subtype", - ) - return dict( - (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + "wav_scale", + "audio_format", + "audio_subtype", ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -171,23 +196,27 @@ def add_class_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") - # parser.add_argument(p1+'output-wav-scale', default=1, type=float, - # help=('scale to divide the waveform before writing')) - parser.add_argument( - "--output-audio-format", + "--audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - "--output-audio-subtype", + "--audio-subtype", default=None, - choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + choices=["pcm_16", "pcm_24", "pcm_32", "float", "double", "vorbis"], help=("coding format for audio file"), ) + try: + parser.add_argument( + "--wav-scale", default="1.0", help=("input waveform scale wrt 1"), + ) + except: + pass + if prefix is not None: outer_parser.add_argument( "--" + prefix, action=ActionParser(parser=parser), diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index 575c3087..63d463fb 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -76,9 +76,8 @@ def read_random_slice(self, key, num_samples, rng, field=""): dataset = key + field assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] - # print('hola',num_rows,num_samples,num_rows-num_samples) - # index = rng.random_integers(low=0, high=num_rows-num_samples, size=1)[0] - index = rng.randint(low=0, high=num_rows - num_samples + 1) + + index = rng.integers(low=0, high=num_rows - num_samples + 1) X = self.f[dataset][index : index + num_samples] return X, index diff --git a/hyperion/io/packed_audio_reader.py b/hyperion/io/packed_audio_reader.py index 17f78bc2..fb17cb18 100644 --- a/hyperion/io/packed_audio_reader.py +++ b/hyperion/io/packed_audio_reader.py @@ -378,7 +378,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -633,7 +634,8 @@ def 
add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15, + default=1.0, + # default=2 ** 15, type=float, help=("multiplicative factor for waveform"), ) diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 93123247..60e01ef1 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -387,11 +387,11 @@ def create(cls, rspecifier): if archive.suffix == ".csv": df = pd.read_csv(archive, nrows=2) storage_path = df["storage_path"].values[0] - if re.match(r".*\.h5$", scp_f2) is not None: + if re.match(r".*\.h5$", storage_path) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark$", scp_f2) is not None: + elif re.match(r".*\.ark$", storage_path) is not None: archive_type = ArchiveType.ARK - elif re.match(r".*[cvg]$", scp_f2) is not None: + elif re.match(r".*[cvg]$", storage_path) is not None: archive_type = ArchiveType.AUDIO else: raise ValueError(f"Unknown format for {storage_path}") diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 799db930..1cc1a0be 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -26,7 +26,7 @@ class SingleNoiseAugment(object): min_snr: mininimum SNR(dB) to sample from. max_snr: maximum SNR(dB) to sample from. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -46,7 +46,7 @@ def __init__( self.cache = None self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -96,7 +96,7 @@ def forward(self, x): while noise is None or noise.shape[0] < num_samples: with self.lock: - noise_idx = self.rng.randint(len(self.noise_keys)) + noise_idx = self.rng.integers(len(self.noise_keys)) key = self.noise_keys[noise_idx] noise_k, fs_k = self.r.read([key]) noise_k = noise_k[0] @@ -112,12 +112,22 @@ def forward(self, x): with self.lock: self.cache = noise_k[need_samples:] + num_zeros = np.sum(noise == 0) with self.lock: + # add dither for noises files with many 0s. + if num_zeros > len(noise) // 3: + noise += 0.0001 * self.rng.standard_normal( + noise.shape, dtype=noise.dtype + ) + target_snr = self.rng.uniform(self.min_snr, self.max_snr) + scale = self._compute_noise_scale(x, noise, target_snr) info = {"noise_type": self.noise_type, "snr": target_snr} - return x + scale * noise, info + y = x + scale * noise + + return y, info def __call__(self, x): return self.forward(x) @@ -136,7 +146,7 @@ class NoiseAugment(object): is proportional to how often we want to sample a given noise type. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): @@ -166,7 +176,7 @@ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -177,7 +187,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). 
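The _compute_noise_scale helper used in SingleNoiseAugment.forward is not part of this hunk; a conventional way to derive that scale from a target SNR in dB is sketched below as an assumption, not as the patch's implementation:

    import numpy as np

    def noise_scale_for_snr(x, noise, snr_db):
        """Scale s such that 10*log10(mean(x**2) / mean((s*noise)**2)) == snr_db."""
        p_x = np.mean(x ** 2)
        p_n = np.mean(noise ** 2) + 1e-10  # guard against all-zero noise
        return np.sqrt(p_x / (p_n * 10 ** (snr_db / 10)))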
Returns: NoiseAugment object @@ -208,7 +218,7 @@ def forward(self, x): # decide whether to add noise or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.noise_prob: # we don't add noise diff --git a/hyperion/np/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py index cf4cc6cb..0b1f3596 100644 --- a/hyperion/np/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -39,7 +39,7 @@ class SingleReverbAugment(object): its first sample. preload_rirs: if True all RIRS are loaded into RAM. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -80,7 +80,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -129,7 +129,7 @@ def forward(self, x): num_samples = x.shape[0] with self.lock: - rir_idx = self.rng.randint(len(self.rir_keys)) + rir_idx = self.rng.integers(len(self.rir_keys)) if self.preload_rirs: h = self.rirs[rir_idx] @@ -155,6 +155,7 @@ def forward(self, x): "h_max": h_max, "h_delay": h_delay, } + return y, info def __call__(self, x): @@ -176,7 +177,7 @@ class ReverbAugment(object): max_reverb_context: number of samples required as left context for the convolution operation. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -210,7 +211,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -221,7 +222,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with reverb options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: ReverbAugment object. @@ -267,7 +268,7 @@ def forward(self, x): # decide whether to add reverb or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.reverb_prob: # we don't add reverb diff --git a/hyperion/np/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py index 0b1233f1..c27ca321 100644 --- a/hyperion/np/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -37,7 +37,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: SpeechAugment object. diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 18a15651..a648190d 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -22,7 +22,7 @@ class SpeedAugment(object): keep_length: applies padding or cropping to keep the lenght of the signal. random_seed: random seed for random number generator. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -34,14 +34,16 @@ def __init__( rng=None, ): logging.info( - "init speed augment with prob={}, speed_ratios={}, keep_length={}". 
- format(speed_prob, speed_ratios, keep_length)) + "init speed augment with prob={}, speed_ratios={}, keep_length={}".format( + speed_prob, speed_ratios, keep_length + ) + ) self.speed_prob = speed_prob self.speed_ratios = speed_ratios self.keep_length = keep_length if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -52,7 +54,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object. @@ -84,7 +86,7 @@ def forward(self, x): """ # decide whether to add noise or not - p = self.rng.random_sample() + p = self.rng.random() if p > self.speed_prob: # we don't add speed perturbation info = {"speed_ratio": 1} @@ -98,14 +100,12 @@ def forward(self, x): # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2**15 # we add some dither in the padding - pad_y = dither * np.ones( - (x.shape[-1] - y.shape[-1], ), dtype=y.dtype) + dither = np.max(x) / 2 ** 15 # we add some dither in the padding + pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: - y = y[:x.shape[-1]] + y = y[: x.shape[-1]] - # print(f"2 r={r} {x.shape} {y.shape}", flush=True) return y, info def __call__(self, x): diff --git a/hyperion/np/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py index 82a84529..e77115cd 100644 --- a/hyperion/np/classifiers/binary_logistic_regression.py +++ b/hyperion/np/classifiers/binary_logistic_regression.py @@ -29,7 +29,7 @@ class BinaryLogisticRegression(LogisticRegression): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. diff --git a/hyperion/np/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py index 842b850e..f03a05a0 100644 --- a/hyperion/np/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -42,8 +42,8 @@ class GreedyFusionBinaryLR(NPModel): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. 
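The keep_length branch of SpeedAugment.forward shown above pads with low-level dither when the perturbed signal comes out shorter (ratio > 1) and crops when it comes out longer (ratio < 1). Extracted into a stand-alone sketch:

    import numpy as np

    def keep_length(x, y, r):
        """Pad or crop the speed-perturbed signal y back to the length of x."""
        if r > 1:                          # y is shorter than x
            dither = np.max(x) / 2 ** 15   # low-level dither for the padding
            pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype)
            return np.concatenate((y, pad_y), axis=-1)
        if r < 1:                          # y is longer than x
            return y[: x.shape[-1]]
        return y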
- random_state: int, RandomState instance or None, optional, default: None - The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. + random_state: int, default_rng instance or None, optional, default: None + The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If default_rng instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and diff --git a/hyperion/np/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py index a6b8c7cc..f551af14 100644 --- a/hyperion/np/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -10,7 +10,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel @@ -426,7 +426,8 @@ def add_class_args(parser, prefix=None): parser.add_argument("--name", default="lgbe", help="model name") if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) @staticmethod @@ -468,7 +469,8 @@ def add_eval_args(parser, prefix=None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/np/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py index 8566aeab..37ac9656 100644 --- a/hyperion/np/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -9,8 +9,13 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import (fullcov_varfloor, int2onehot, invert_pdmat, - logdet_pdmat, softmax) +from ...utils.math_funcs import ( + fullcov_varfloor, + int2onehot, + invert_pdmat, + logdet_pdmat, + softmax, +) from ..np_model import NPModel from .linear_gbe import LinearGBE diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py index 5d743a46..6a977df9 100644 --- a/hyperion/np/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -10,7 +10,7 @@ from sklearn.svm import LinearSVC as SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -41,7 +41,7 @@ class LinearSVMC(NPModel): The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None max_iter: int, default: 100 Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. @@ -61,7 +61,7 @@ class LinearSVMC(NPModel): penalty and dual will be ignored. 
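The "balanced" class-weight rule quoted in the classifier docstrings here is n_samples / (n_classes * np.bincount(y)); a quick numeric check:

    import numpy as np

    y = np.array([0, 0, 0, 1])  # 3 samples of class 0, 1 sample of class 1
    weights = len(y) / (len(np.unique(y)) * np.bincount(y))
    # -> array([0.667, 2.0]); the minority class receives the larger weight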
verbose: int, default: 0 balance_class_weight: if True and class_weight is None, it makes class_weight="balanced". - lr_seed: seed form RandomState, used when random_state is None. + lr_seed: seed form default_rng, used when random_state is None. labels: list of class labels """ @@ -93,7 +93,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.use_bias = use_bias self.bias_scaling = bias_scaling diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 8e3d7e2e..4c4c0cfc 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression as LR from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -36,7 +36,7 @@ class LogisticRegression(NPModel): Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. 
@@ -93,7 +93,7 @@ def __init__( super().__init__(**kwargs) if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) if bias_scaling is None: if use_bias and solver == "liblinear": diff --git a/hyperion/np/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py index 9e54e0f4..3345dd72 100644 --- a/hyperion/np/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -9,7 +9,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 6b54034b..ac5211ef 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -12,7 +12,7 @@ from sklearn.svm import SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ...utils.misc import filter_func_args from ..np_model import NPModel @@ -49,7 +49,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.C = C self.kernel = kernel diff --git a/hyperion/np/feats/energy_vad.py b/hyperion/np/feats/energy_vad.py index 5b9eb751..1d578c68 100644 --- a/hyperion/np/feats/energy_vad.py +++ b/hyperion/np/feats/energy_vad.py @@ -5,6 +5,7 @@ import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import lfilter from ...hyp_defs import float_cpu @@ -19,7 +20,7 @@ class EnergyVAD(object): sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) frame_length: Frame length in milliseconds (default = 25) frame_shift: Frame shift in milliseconds (default = 10) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 2^(-15)) snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (default = True) vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5) vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5) @@ -32,7 +33,7 @@ def __init__( sample_frequency=16000, frame_length=25, frame_shift=10, - dither=1, + dither=1 / 2 ** 15, snip_edges=True, vad_energy_mean_scale=0.5, vad_energy_threshold=5, @@ -97,7 +98,7 @@ def compute(self, x, return_loge=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -174,14 +175,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
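The dither default follows the new wav_scale convention: one 16-bit LSB is 1.0 for integer-range samples but 1/2**15 once waveforms are normalized to [-1, 1]. A minimal sketch of dithering at that level with the Generator API (whose normal sampler is standard_normal):

    import numpy as np

    x = np.zeros(8, dtype=np.float32)  # silence in the [-1, 1] convention
    dither = 1.0 / 2 ** 15             # one int16 LSB at unit scale
    rng = np.random.default_rng(seed=len(x))
    x = x + dither * rng.standard_normal(len(x)).astype(np.float32)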
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help=( @@ -191,24 +190,21 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -221,7 +217,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "vad-energy-mean-scale", + "--vad-energy-mean-scale", type=float, default=0.5, help=( @@ -231,13 +227,13 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-energy-threshold", + "--vad-energy-threshold", type=float, default=5, help="Constant term in energy threshold for MFCC0 for VAD", ) parser.add_argument( - p1 + "vad-frames-context", + "--vad-frames-context", type=int, default=0, help=( @@ -246,7 +242,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-proportion-threshold", + "--vad-proportion-threshold", type=float, default=0.6, help=( @@ -254,5 +250,7 @@ def add_class_args(parser, prefix=None): "the window that need to have more energy than the threshold" ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/np/feats/mfcc.py b/hyperion/np/feats/mfcc.py index cd98840d..b56728b8 100644 --- a/hyperion/np/feats/mfcc.py +++ b/hyperion/np/feats/mfcc.py @@ -6,6 +6,7 @@ from enum import Enum import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.fftpack import dct from scipy.signal import lfilter @@ -72,7 +73,7 @@ class MFCC(object): preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey') use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 1/2**15) fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi') low_freq: Low cutoff frequency for mel bins (default = 20) high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0) @@ -98,7 +99,7 @@ def __init__( preemphasis_coeff=0.97, window_type="povey", use_fft2=True, - dither=1, + dither=1 / 2 ** 15, fb_type="mel_kaldi", low_freq=20, high_freq=0, @@ -256,7 +257,7 @@ def compute(self, x, return_fft=False, return_spec=False, return_logfb=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -400,14 +401,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help="Waveform data sample frequency " @@ -415,27 +414,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", - ) - parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "fft-length", type=int, default=512, help="Length of FFT" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) + parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT") parser.add_argument( - p1 + "remove-dc-offset", + "--remove-dc-offset", default=True, type=str2bool, help="Subtract mean from waveform on each frame", ) parser.add_argument( - p1 + "preemphasis-coeff", + "--preemphasis-coeff", type=float, default=0.97, help="Coefficient for use in signal preemphasis", @@ -444,30 +438,30 @@ def add_class_args(parser, prefix=None): FWF.add_class_args(parser, prefix) parser.add_argument( - p1 + "use-fft2", + "--use-fft2", default=True, type=str2bool, help="If true, it uses |X(f)|^2, if false, it uses |X(f)|", ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) FBF.add_class_args(parser, prefix) parser.add_argument( - p1 + "num-ceps", + "--num-ceps", type=int, default=13, help="Number of cepstra in MFCC computation (including C0)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -480,34 +474,34 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "energy-floor", + "--energy-floor", type=float, default=0, help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - p1 + "raw-energy", + "--raw-energy", default=True, type=str2bool, help="If true, compute energy before preemphasis and windowing", ) parser.add_argument( - p1 + "use-energy", + "--use-energy", default=True, type=str2bool, help="Use energy (not C0) in MFCC computation", ) parser.add_argument( - p1 + "cepstral-lifter", + "--cepstral-lifter", type=float, default=22, help="Constant that controls scaling of MFCCs", ) parser.add_argument( - p1 + "input-step", + "--input-step", default="wave", choices=["wave", "fft", "spec", "log_spec", "logfb"], help=( @@ -516,7 +510,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "output-step", + "--output-step", default="mfcc", choices=["fft", "spec", "log_spec", "logfb", "mfcc"], help=( @@ -524,4 +518,7 @@ def add_class_args(parser, prefix=None): ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args diff --git a/hyperion/np/metrics/__init__.py b/hyperion/np/metrics/__init__.py index 36afdbf5..d45daba5 100644 --- a/hyperion/np/metrics/__init__.py +++ b/hyperion/np/metrics/__init__.py @@ -5,7 +5,10 @@ from .acc import compute_accuracy from .confusion_matrix import * -from .dcf import (compute_act_dcf, compute_dcf, compute_min_dcf, - fast_eval_dcf_eer) +from .dcf import compute_act_dcf, compute_dcf, compute_min_dcf, fast_eval_dcf_eer from .eer import compute_eer, compute_prbep from .utils import effective_prior +from 
.verification_evaluator import ( + VerificationEvaluator, + VerificationAdvAttackEvaluator, +) diff --git a/hyperion/np/metrics/cllr.py b/hyperion/np/metrics/cllr.py index ec816286..cd97a97c 100644 --- a/hyperion/np/metrics/cllr.py +++ b/hyperion/np/metrics/cllr.py @@ -5,7 +5,7 @@ import numpy as np -from ..utils.math import neglogsigmoid +from ..utils.math_funcs import neglogsigmoid from .utils import opt_loglr diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py index 0715d809..e638fd1b 100644 --- a/hyperion/np/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -8,7 +8,7 @@ import numpy as np from ...hyp_defs import float_cpu -from ...utils.math import logsumexp, softmax +from ...utils.math_funcs import logsumexp, softmax def effective_prior(p_tar, c_miss, c_fa): diff --git a/hyperion/np/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py index 2adf15cf..e35e7cf7 100644 --- a/hyperion/np/metrics/verification_evaluator.py +++ b/hyperion/np/metrics/verification_evaluator.py @@ -2,8 +2,6 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - - import copy import logging import re @@ -18,13 +16,13 @@ import matplotlib.pyplot as plt from ...hyp_defs import float_cpu -from ...utils import TrialKey, TrialScores +from ...utils import TrialKey, TrialScores, SparseTrialKey, SparseTrialScores from ...utils.trial_stats import TrialStats from .dcf import fast_eval_dcf_eer from .utils import effective_prior -class VerificationEvaluator(object): +class VerificationEvaluator: """Class computes performance metrics for verification problems. Same metrics can be obtained from fast_eval_dcf_eer functions @@ -34,21 +32,40 @@ class VerificationEvaluator(object): p_tar: target prior float or list/nparray sorted in ascending order c_miss: cost of miss c_fa: cost of false alarm - + key_name: name describing the key + score_name: name describing the score + sparse: use sparse versions of TrialScores and Keys """ - def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): - + def __init__( + self, + key, + scores, + p_tar, + c_miss=None, + c_fa=None, + key_name=None, + score_name=None, + sparse=False, + ): if isinstance(key, str): - logging.info("Load key: %s" % key) - key = TrialKey.load(key) + logging.info("Load key: %s", key) + if sparse: + key = SparseTrialKey.load(key) + else: + key = TrialKey.load(key) if isinstance(scores, str): - logging.info("Load scores: %s" % scores) - scores = TrialScores.load(scores) + logging.info("Load scores: %s", scores) + if sparse: + scores = SparseTrialScores.load(scores) + else: + scores = TrialScores.load(scores) self.key = key self.scores = scores.align_with_ndx(key) + self.key_name = key_name + self.score_name = score_name # compute effective prior is c_miss and c_fa are given if isinstance(p_tar, float): @@ -56,13 +73,16 @@ def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): p_tar = np.asarray(p_tar) if c_miss is not None and c_fa is not None: + assert len(c_miss) == len(p_tar) + assert len(c_fa) == len(p_tar) c_miss = np.asarray(c_miss) c_fa = np.asarray(c_fa) p_tar = effective_prior(p_tar, c_miss, c_fa) + self._p_tar_sort = np.argsort(p_tar) self.p_tar = p_tar - def compute_dcf_eer(self, return_df=False): + def compute_dcf_eer(self, return_df=True): """ Computes DCF/EER @@ -74,24 +94,38 @@ def compute_dcf_eer(self, return_df=False): """ logging.info("separating tar/non") tar, non = self.scores.get_tar_non(self.key) + ntar = 
len(tar) + nnon = len(non) logging.info("computing EER/DCF") - min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, self.p_tar) + min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer( + tar, non, self.p_tar[self._p_tar_sort] + ) + min_dcf[self._p_tar_sort] = min_dcf.copy() + act_dcf[self._p_tar_sort] = act_dcf.copy() if not return_df: - return min_dcf, act_dcf, eer + return min_dcf, act_dcf, eer, ntar, nnon if len(self.p_tar) == 1: eer = [eer] min_dcf = [min_dcf] act_dcf = [act_dcf] - df = pd.DataFrame({"eer": eer}) - + df = pd.DataFrame( + { + "scores": [self.score_name], + "key": [self.key_name], + "eer": eer, + "eer(%)": eer * 100, + } + ) for i in range(len(min_dcf)): pi = self.p_tar[i] df["min-dcf-%.3f" % (pi)] = min_dcf[i] df["act-dcf-%.3f" % (pi)] = act_dcf[i] + df["num_targets"] = ntar + df["num_nontargets"] = nnon return df @@ -116,9 +150,7 @@ class VerificationAdvAttackEvaluator(VerificationEvaluator): def __init__( self, key, scores, attack_scores, attack_stats, p_tar, c_miss=None, c_fa=None ): - super(VerificationAdvAttackEvaluator, self).__init__( - key, scores, p_tar, c_miss, c_fa - ) + super().__init__(key, scores, p_tar, c_miss, c_fa) if not isinstance(attack_scores, list): attack_scores = [attack_scores] if not isinstance(attack_stats, list): @@ -133,7 +165,7 @@ def __init__( if isinstance(attack_scores[0], str): l = [] for file_path in attack_scores: - logging.info("Load attack scores: %s" % file_path) + logging.info("Load attack scores: %s", file_path) scores = TrialScores.load(file_path) l.append(scores) attack_scores = l @@ -151,7 +183,7 @@ def __init__( if isinstance(attack_stats[0], str): l = [] for file_path in attack_stats: - logging.info("Load attack stats: %s" % file_path) + logging.info("Load attack stats: %s", file_path) scores = TrialStats.load(file_path) l.append(scores) attack_stats = l @@ -216,7 +248,7 @@ def compute_dcf_eer_vs_stats( stat_bins, attacked_trials="all", higher_better=False, - return_df=False, + return_df=True, ): """ Computes DCF/EER versus SNR/Linf/etc curves @@ -307,7 +339,7 @@ def find_best_attacks( threshold=None, prior_idx=0, higher_better=False, - return_df=False, + return_df=True, ): """ Find the best attacks from the point of view of some of the stats. 
E.g., diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py index b8f8bb54..67872315 100644 --- a/hyperion/np/pdfs/core/normal.py +++ b/hyperion/np/pdfs/core/normal.py @@ -7,11 +7,20 @@ import scipy.linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, symmat2vec, vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -213,7 +222,7 @@ def sample(self, num_samples, rng=None, seed=1024): assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype( float_cpu() ) diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py index c9986f4c..23535112 100644 --- a/hyperion/np/pdfs/core/normal_diag_cov.py +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -7,9 +7,12 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -183,7 +186,7 @@ def sample(self, num_samples, rng=None, seed=1024): """ assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) return self.mu + 1.0 / self.cholLambda * x diff --git a/hyperion/np/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py index 80232e36..92d9c371 100644 --- a/hyperion/np/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -6,7 +6,7 @@ import numpy as np from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax +from ....utils.math_funcs import logsumexp, softmax from ..core import PDF @@ -232,7 +232,7 @@ def viterbi_decode(self, x, nbest=1): def sample(self, num_seqs, num_steps, rng=None, seed=1024): if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = np.zeros((num_seqs, num_steps, self.num_states), dtype=float_cpu()) x[:, 0, :] = rng.multinomial(1, self.pi, size=(num_seqs,)) diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 041431fb..6e2b79e3 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -7,8 +7,13 @@ from scipy import linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (invert_pdmat, invert_trimat, logdet_pdmat, - symmat2vec, vec2symmat) +from ....utils.math_funcs import ( + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) from ..core.pdf import PDF diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 5560882c..2186522e 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -7,7 +7,7 @@ import numpy as np from ....hyp_defs import float_cpu -from 
....utils.math import logsumexp, softmax +from ....utils.math_funcs import logsumexp, softmax from ....utils.queues import GeneratorQueue from ..core import PDF diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py index ca197142..7b080dae 100644 --- a/hyperion/np/pdfs/mixtures/gmm.py +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -8,12 +8,22 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, logsumexp, softmax, symmat2vec, - vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + logsumexp, + softmax, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from ..core import Normal from .exp_family_mixture import ExpFamilyMixture @@ -292,7 +302,7 @@ def sample(self, num_samples, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index 90141573..7589243e 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -8,10 +8,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .exp_family_mixture import ExpFamilyMixture @@ -262,7 +265,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py index 4dc8f46e..6ef7c891 100644 --- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -7,10 +7,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .gmm_diag_cov import GMMDiagCov @@ -193,7 +196,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). 
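The sample() methods touched above all follow the same ancestral-sampling pattern: draw one-hot component indicators from the mixture weights, then draw from the selected component. An illustrative sketch with unit-variance components for brevity (not patch code):

    import numpy as np

    rng = np.random.default_rng(1024)
    pi = np.array([0.7, 0.3])                        # mixture weights
    mu = np.array([[0.0, 0.0], [5.0, 5.0]])          # component means
    num_samples = 4

    r = rng.multinomial(1, pi, size=(num_samples,))  # (num_samples, num_comp) one-hot
    comp = r.argmax(axis=1)                          # selected component per sample
    x = mu[comp] + rng.standard_normal((num_samples, 2))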
""" if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py index 183725a7..af8c5d8b 100644 --- a/hyperion/np/pdfs/plda/frplda.py +++ b/hyperion/np/pdfs/plda/frplda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -465,7 +465,7 @@ def sample( assert self.is_init if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sb = invert_pdmat(self.B, return_inv=True)[-1] chol_Sb = sla.cholesky(Sb, lower=False) diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py index fd2eb9a9..76299970 100644 --- a/hyperion/np/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -674,7 +674,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) x_dim = self.mu.shape[0] diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index f9322d26..5d397183 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -6,7 +6,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -502,7 +502,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sw = invert_pdmat(self.W, return_inv=True)[-1] chol_Sw = sla.cholesky(Sw, lower=False) diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index 3f60c4be..ebabc6ec 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -23,7 +23,7 @@ class SklTSNE(NPModel): metric: the metric to use when calculating distance between instances in ['cosine', 'euclidean', 'l1', 'l2', 'precomputed'] or callable function. init: initialization method in ['random', 'pca'] or embedding matrix of shape (num_samples, num_comp) verbose: verbosity level. 
- rng: RandomState instance + rng: default_rng instance rng_seed: seed for random number generator method: gradient calculation method in [‘barnes_hut’, 'exact'] angle: angle thetha in Barnes-Hut TSNE @@ -53,7 +53,7 @@ def __init__( super().__init__(**kwargs) self.rng_seed = rng_seed if rng is None: - rng = np.random.RandomState(seed=rng_seed) + rng = np.random.default_rng(seed=rng_seed) self._tsne = TSNE( n_components=tsne_dim, diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index fa675fdb..f91d7d96 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -304,6 +304,7 @@ def __getitem__(self, segment): x, fs = self._read_audio(seg_id, start, duration) x, fs = self._resample(x, fs) data = {"seg_id": seg_id, "sample_freq": fs} + if self.augmenters: # augmentations if duration == 0: @@ -324,6 +325,17 @@ def __getitem__(self, segment): seg_info = self._get_segment_info(seg_id) data.update(seg_info) + if np.any(~np.isfinite(data["x"])): + print( + "zzz", + x.max(), + x.min(), + x.mean(), + data["x"].max(), + data["x"].min(), + data["x"].mean(), + flush=True, + ) return data @staticmethod diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index a8398dac..6d0b4df4 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -315,7 +315,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--dither", type=float, - default=1, + default=1.0 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 06838ddd..29b6cdaa 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -7,11 +7,19 @@ from .transducer import RNNRNNTransducer, RNNTransducer from .vae.vae import VAE from .vae.vq_vae import VQVAE -from .wav2transducer import (HFWav2Vec2ConformerV1RNNTransducer, - HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) -from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from .wav2transducer import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2Transducer, +) +from .wav2xvectors import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, + Wav2ResNetXVector, + Wav2ResNet1dXVector, +) from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/plda/splda.py b/hyperion/torch/models/plda/splda.py index 2272793e..3a0f1dee 100644 --- a/hyperion/torch/models/plda/splda.py +++ b/hyperion/torch/models/plda/splda.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from ...utils.math import invert_trimat +from ...utils.math_funcs import invert_trimat from .plda_base import PLDABase diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index c2bcdf99..24ab5bbb 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -224,7 +224,7 @@ def extract_embed( ): if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) + x, x_lengths = remove_silence(x, vad_samples, x_lengths) feats, _, feat_lengths = self.forward_feats( x, x_lengths, 
chunk_length=hf_chunk_length, detach_chunks=detach_chunks @@ -301,7 +301,7 @@ def set_train_mode(self, mode): logging.info("train mode set to %s", mode) - if "nograd" in mode: + if "nograd" in mode or mode == "ft-embed-affine": logging.info("using torch.no_grad for hf_feats") self._hf_context = torch.no_grad() else: diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 0d9f1bc4..0e4faded 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 1f7283a0..11d643af 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 4c21f478..4bbc0c4c 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import contextlib import logging from jsonargparse import ActionParser, ArgumentParser @@ -35,6 +36,23 @@ def __init__(self, feats, xvector): self.feats = feats self.xvector = xvector + self._feats_context = contextlib.nullcontext() + + @property + def sample_frequency(self): + return self.feats.sample_frequency + + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) def rebuild_output_layer( self, @@ -58,8 +76,9 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def compute_prototype_affinity(self): - return self.xvector.compute_prototype_affinity() + def change_config(self, xvector): + logging.info("changing 
wav2xvector config") + self.xvector.change_config(**xvector) def forward( self, @@ -73,15 +92,28 @@ def forward( return_output=True, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) - - # feat_lengths = torch.div(x_lengths * feats.size(-1), x.size(-1)) - return self.xvector(feats, feat_lengths, y, enc_layers, classif_layers, - return_output) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + n = torch.sum(~torch.isfinite(feats)) + if n > 0: + print( + "feats", + n, + torch.sum(torch.isnan(feats)), + torch.sum(torch.any(torch.isnan(x), dim=-1)), + x.dtype, + feats.dtype, + flush=True, + ) + return self.xvector( + feats, feat_lengths, y, enc_layers, classif_layers, return_output + ) def extract_embed( self, @@ -94,18 +126,54 @@ def extract_embed( detach_chunks=False, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) - feats = feats.transpose(1, 2) - return self.xvector.extract_embed(feats, feat_lengths, chunk_length, - embed_layer, detach_chunks) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + chunk_length = int(chunk_length * feats.shape[1] / x.shape[-1]) + + return self.xvector.extract_embed( + feats, feat_lengths, chunk_length, embed_layer, detach_chunks + ) def set_train_mode(self, mode): - self.xvector.set_train_mode(mode) + if mode == self._train_mode: + return + + if mode == "full-feats-grad": + self._feats_context = contextlib.nullcontext() + xvector_mode = "full" + else: + logging.info("using torch.no_grad for feats") + self._feats_context = torch.no_grad() + + self.xvector.set_train_mode(xvector_mode) + self._train_mode = mode + + def _train(self, train_mode: str): + + self.feats.train() + if train_mode in ["frozen"]: + super()._train(train_mode) + elif train_mode in ["full-feats-grad", "full"]: + self.xvector._train("full") + elif train_mode == "ft-embed-affine": + self.xvector._train("ft-embed_affine") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "full-feats-grad", + ] def get_config(self): feat_cfg = self.feats.get_config() @@ -119,7 +187,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def filter_args(*kwargs): + def filter_args(**kwargs): """Filters Wav2XVector class arguments from arguments dictionary. 
Args: @@ -150,5 +218,4 @@ def add_class_args(parser, prefix=None): AudioFeatsMVN.add_class_args(parser, prefix="feats") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index a9ad224e..440c22b6 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -50,6 +50,10 @@ def __init__( self.trans = trans self.aug_after_mvn = aug_after_mvn + @property + def sample_frequency(self): + return self.audio_feats.fs + @property def fs(self): return self.audio_feats.fs diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 0cb887ca..e7020e1d 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,11 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os from collections import OrderedDict as ODict from copy import deepcopy from enum import Enum from typing import Optional +from pathlib import Path import torch import torch.nn as nn @@ -110,13 +110,11 @@ def valid_train_modes(): return ["full", "frozen"] def save(self, file_path): - file_dir = os.path.dirname(file_path) - if not (os.path.isdir(file_dir)): - os.makedirs(file_dir, exist_ok=True) - - config = self.get_config() + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) torch.save( - {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()} + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}, + file_path, ) @staticmethod @@ -176,7 +174,7 @@ def _fix_cfg_compatibility(class_obj, cfg): Fixed configuration dictionary. 
""" # for compatibility with older x-vector models - XVector = torch_model_registry["xvector"] + XVector = TorchModel.registry["XVector"] if issubclass(class_obj, XVector): # We renamed AM-softmax scale parameer s to cos_scale if "s" in cfg: @@ -195,8 +193,9 @@ def auto_load(file_path, extra_objs={}, map_location=None): cfg = model_data["model_cfg"] class_name = cfg["class_name"] del cfg["class_name"] - if class_name in torch_model_registry: - class_obj = torch_model_registry[class_name] + print(TorchModel.registry) + if class_name in TorchModel.registry: + class_obj = TorchModel.registry[class_name] elif class_name in extra_objs: class_obj = extra_objs[class_name] else: diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index fe72339f..4d4dd55a 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -100,3 +100,19 @@ def cat(cls, tables): ) df["class_idx"].drop(columns=["class_idx"], inplace=True) return cls(df) + + def filter( + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + rebuild_idx=False, + ): + new_class_info = super().filter(predicate, items, iindex, columns, by, keep) + if rebuild_idx: + new_class_info.add_class_idx() + + return new_class_info diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index d1d969fb..dd446576 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -4,13 +4,14 @@ """ import logging from pathlib import Path -from typing import Dict, Optional, Union +from typing import List, Dict, Optional, Union from copy import deepcopy import math import numpy as np import pandas as pd import yaml +from .info_table import InfoTable from .class_info import ClassInfo from .feature_set import FeatureSet from .misc import PathLike @@ -30,7 +31,7 @@ class Dataset: Attributes: segments: SegmentSet object or path to it. 
classes: Dictionary of ClassInfo objects or paths to then - recordings: Dictionary of RecordingSet objects or paths to then + recordings: RecordingSet object or paths to then features: Dictionary of FeatureSet objects or paths to then enrollments: Dictionary of EnrollmentMap objects or paths to then trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects @@ -45,7 +46,7 @@ def __init__( self, segments: Union[SegmentSet, PathLike], classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, - recordings: Optional[Dict[str, Union[RecordingSet, PathLike]]] = None, + recordings: Optional[Union[RecordingSet, PathLike]] = None, features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, trials: Optional[ @@ -65,24 +66,65 @@ def __init__( self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) - self._recordings, self._recordings_paths = self._parse_dict_args( - recordings, RecordingSet - ) + if isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + assert isinstance(recordings, (str, Path)) + self._recordings = None + self._recordings_path = Path(recordings) + + # self._recordings, self._recordings_paths = self._parse_dict_args( + # recordings, RecordingSet + # ) self._features, self._features_paths = self._parse_dict_args( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, - EnrollmentMap, + enrollments, EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, - (TrialKey, TrialNdx, SparseTrialKey), + trials, (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials self.table_sep = table_sep + self._files_to_delete = [] + + def get_dataset_files(self): + file_paths = [] + for file_path in [self._segments_path, self._recordings_path]: + if file_path is not None: + file_paths.append(file_path) + + for path_dict in [ + self._features_paths, + self._enrollments_paths, + self._trials_paths, + ]: + if path_dict is None: + continue + for k, v in path_dict.items(): + file_paths.append(v) + + return file_paths + + def _delete_files(self, dataset_dir): + if not self._files_to_delete: + return + + dataset_files = self.get_dataset_files() + for file_path in self._files_to_delete: + file_path = Path(file_path) + # if the file has been added again we don't delete + if file_path in dataset_files: + continue + + # if we are saving the dataset to another location + # we don't delete the one in the original + if file_path.parent == dataset_dir and file_path.is_file(): + file_path.unlink() def _parse_dict_args(self, data, types): if data is None: @@ -109,17 +151,38 @@ def segments(self, keep_loaded: bool = True): return self._segments - def recordings_value(self, key: str, keep_loaded: bool = True): - if self._recordings[key] is None: - assert self._recordings_paths[key] is not None - recordings = RecordingSet.load( - self._recordings_paths[key], sep=self.table_sep - ) + def __len__(self): + return len(self.segments()) + + def recordings(self, keep_loaded: bool = True): + if self._recordings is None: + assert self._recordings_path is not None + recordings = RecordingSet.load(self._recordings_path, sep=self.table_sep) if keep_loaded: - self._recordings[key] = recordings + self._recordings = recordings return recordings - return self._recordings[key] + return self._recordings + + # def recordings_value(self, key: str, keep_loaded: bool = True): + # 
if self._recordings[key] is None: + # assert self._recordings_paths[key] is not None + # recordings = RecordingSet.load( + # self._recordings_paths[key], sep=self.table_sep + # ) + # if keep_loaded: + # self._recordings[key] = recordings + # return recordings + + # return self._recordings[key] + + def features_keys(self): + if self._features is not None: + return self._features.keys() + elif self._features_path is not None: + return self._features_path.keys() + else: + return {} def features_value(self, key: str, keep_loaded: bool = True): if self._features[key] is None: @@ -131,6 +194,14 @@ def features_value(self, key: str, keep_loaded: bool = True): return self._features[key] + def classes_keys(self): + if self._classes is not None: + return self._classes.keys() + elif self._classes_path is not None: + return self._classes_path.keys() + else: + return {} + def classes_value(self, key: str, keep_loaded: bool = True): if self._classes[key] is None: assert self._classes_paths[key] is not None @@ -170,12 +241,12 @@ def trials_value(self, key: str, keep_loaded: bool = True): return self._trials[key] - def recordings(self, keep_loaded: bool = True): - if self._recordings is None: - yield from () - else: - for key in self._recordings.keys(): - yield key, self.recordings_value(key, keep_loaded) + # def recordings(self, keep_loaded: bool = True): + # if self._recordings is None: + # yield from () + # else: + # for key in self._recordings.keys(): + # yield key, self.recordings_value(key, keep_loaded) def features(self, keep_loaded: bool = True): if self._features is None: @@ -299,7 +370,6 @@ def save_changed( dataset_path: PathLike, update_paths: bool = True, table_sep: Optional[str] = None, - force_save_all: bool = False, ): """Saves the tables that change in disk or tables that are not in the ouput directory. 
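After this refactor a Dataset carries a single RecordingSet (or the path to it) rather than a dictionary of named recording sets: recordings() loads and caches it lazily, set_recordings()/remove_recordings() replace the keyed add/remove variants, and removed tables are only unlinked from disk at the next save through _delete_files(). A hedged usage sketch; the file paths are placeholders and only the class and method names come from the patch:

from hyperion.utils.dataset import Dataset  # module path as in this patch

dset = Dataset(
    segments="data/train/segments.csv",        # placeholder paths
    recordings="data/train/recordings.csv",
)
recs = dset.recordings()                        # the single RecordingSet, loaded on first access
dset.set_recordings("data/train/recordings_fixed.csv", update_seg_durs=False)
dset.remove_recordings()                        # file is queued for deletion at the next save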
@@ -330,24 +400,36 @@ def save_changed( if update_paths: self._segments_path = file_path - if self._recordings is not None: - file_names = {} - for k in self._recordings.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._recordings[k] is not None - or file_path != self._recordings_paths[k] - or not file_path.exists() - ): - v = self.recordings_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path - - if file_names: - dataset["recordings"] = file_names + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_path + or not file_path.exists() + ): + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # if self._recordings is not None: + # file_names = {} + # for k in self._recordings.keys(): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # if ( + # self._recordings[k] is not None + # or file_path != self._recordings_paths[k] + # or not file_path.exists() + # ): + # v = self.recordings_value(k, keep_loaded=False) + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names if self._features is not None: file_names = {} @@ -428,6 +510,8 @@ def save_changed( with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + def save_all( self, dataset_path: PathLike, @@ -457,17 +541,24 @@ def save_all( if update_paths: self._segments_path = file_path - file_names = {} - for k, v in self.recordings(keep_loaded=False): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path - if file_names: - dataset["recordings"] = file_names + # file_names = {} + # for k, v in self.recordings(keep_loaded=False): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names file_names = {} for k, v in self.features(keep_loaded=False): @@ -520,10 +611,13 @@ def save_all( with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + def update_from_disk(self): self.segments() - for k, v in self.recordings(): - pass + self.recordings() + # for k, v in self.recordings(): + # pass for k, v in self.features(): pass @@ -568,9 +662,10 @@ def load( classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: - recordings = {} - for k, v in dataset["recordings"].items(): - recordings[k] = Dataset.resolve_file_path(dataset_dir, v) + recordings = Dataset.resolve_file_path(dataset_dir, dataset["recordings"]) + # recordings = {} + # for k, v in dataset["recordings"].items(): + # recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} @@ -615,32 +710,42 @@ def 
add_features(self, features_name: str, features: Union[PathLike, FeatureSet] else: raise ValueError() - def add_recordings( - self, - recordings_name: str, - recordings: Union[PathLike, RecordingSet], + def set_segments( + self, segments: Union[PathLike, SegmentSet], update_seg_durs: bool, ): - if self._recordings is None: - self._recordings = {} - self._recordings_paths = {} + if isinstance(segments, (str, Path)): + self._segments = None + self._segments_path = segments + elif isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + raise ValueError() - if isinstance(features, (str, Path)): - self._recordings[features_name] = None - self._recordings_paths[recordings_name] = recordings + def set_recordings( + self, recordings: Union[PathLike, RecordingSet], update_seg_durs: bool, + ): + if isinstance(recordings, (str, Path)): + self._recordings = None + self._recordings_path = Path(recordings) elif isinstance(recordings, RecordingSet): - self._recordings[recordings_name] = recordings - self._recordings_paths[recordings_name] = None + self._recordings = recordings + self._recordings_path = None else: raise ValueError() + if update_seg_durs: + rec_ids = self.segments(keep_loaded=True).recordings() + self.segments()["duration"] = self.recordings().loc[rec_ids, "duration"] + def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): if self._classes is None: self._classes = {} self._classes_paths = {} if isinstance(classes, (str, Path)): - self._classes[features_name] = None - self._classes_paths[classes_name] = classes + self._classes[classes_name] = None + self._classes_paths[classes_name] = Path(classes) elif isinstance(classes, ClassInfo): self._classes[classes_name] = classes self._classes_paths[classes_name] = None @@ -648,9 +753,7 @@ def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): raise ValueError() def add_enrollments( - self, - enrollments_name: str, - enrollments: Union[PathLike, EnrollmentMap], + self, enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], ): if self._enrollments is None: self._enrollments = {} @@ -658,7 +761,7 @@ def add_enrollments( if isinstance(enrollments, (str, Path)): self._enrollments[enrollments_name] = None - self._enrollments_paths[enrollments_name] = enrollments + self._enrollments_paths[enrollments_name] = Path(enrollments) elif isinstance(enrollments, EnrollmentMap): self._enrollments[enrollments_name] = enrollments self._enrollments_paths[enrollments_name] = None @@ -675,8 +778,8 @@ def add_trials( self._trials_paths = {} if isinstance(trials, (str, Path)): - self._trials[features_name] = None - self._trials_paths[trials_name] = trials + self._trials[trials_name] = None + self._trials_paths[trials_name] = Path(trials) elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): self._trials[trials_name] = trials self._trials_paths[trials_name] = None @@ -685,85 +788,104 @@ def add_trials( def remove_features(self, features_name: str): if self._features_paths[features_name] is not None: - file_path = Path(self._features_paths[features_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._features_paths[features_name]) del self._features[features_name] del self._features_paths[features_name] - def remove_recordings( - self, - recordings_name: str, - ): - if self._recordingsr_paths[recordings_name] is not None: - file_path = Path(self._recordings_paths[recordings_name]) - if file_path.is_file(): - 
file_path.unlink() + def remove_recordings(self,): + if self._recordings_path is not None: + self._files_to_delete.append(self._recordings_path) - del self._recordings[recordings_name] - del self._recordings_paths[recordings_name] + self._recordings = None + self._recordings_path = None + + # def remove_recordings( + # self, + # recordings_name: str, + # ): + # if self._recordingsr_paths[recordings_name] is not None: + # file_path = Path(self._recordings_paths[recordings_name]) + # if file_path.is_file(): + # file_path.unlink() + + # del self._recordings[recordings_name] + # del self._recordings_paths[recordings_name] def remove_classes(self, classes_name: str): if self._classes_paths[classes_name] is not None: - file_path = Path(self._classes_paths[classes_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._class_paths[class_name]) del self._classes[classes_name] del self._classes_paths[classes_name] def remove_enrollments( - self, - enrollments_name: str, + self, enrollments_name: str, ): if self._enrollments_paths[enrollments_name] is not None: - file_path = Path(self._enrollments_paths[enrollments_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._enrollments_paths[enrollments_name]) del self._enrollments[enrollments_name] del self._enrollments_paths[enrollments_name] def remove_trials( - self, - trials_name: str, + self, trials_name: str, ): if self._trials_paths[trials_name] is not None: - file_path = Path(self._trials_paths[trials_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._trials_paths[trials_name]) del self._trials[trials_name] del self._trials_paths[trials_name] - def set_segments(self, segments: Union[PathLike, SegmentSet]): - if isinstance(segments, SegmentSet): - self._segments = segments - else: - self._segments_path = segments + def add_cols_to_segments( + self, + right_table: Union[InfoTable, pd.DataFrame, PathLike], + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, (str, Path)): + file_path = Path(right_table) + if file_path.is_file(): + right_table = InfoTable.load(file_path) + else: + if right_table == "recordings": + right_table = self.recordings() + elif right_table in self.features_keys(): + right_table = self.features_value(right_table) + elif right_table in self.classes_keys(): + right_table = self.classes_value + else: + raise ValueError("%s not found", right_table) + + segments = self.segments(keep_loaded=True) + segments.add_columns(right_table, column_names, on=on, right_on=right_on) - def clean(self): - rec_ids = self.segments().recording_ids() - for k, table in self.recordings(): - table = table.loc[table["id"].isin(rec_ids)].copy() - self._recordings[k] = RecordingSet(table) + def clean(self, rebuild_class_idx=False): + rec_ids = self.segments().recordings() + # for k, table in self.recordings(): + # # table = table.loc[table["id"].isin(rec_ids)].copy() + # # self._recordings[k] = RecordingSet(table) + self._recordings = self.recordings().filter(lambda df: df["id"].isin(rec_ids)) ids = self.segments()["id"].values for k, table in self.features(): - table = table.loc[table["id"].isin(ids)].copy() - self._features[k] = FeatureSet(table) + self._features[k] = table.filter(lambda df: df["id"].isin(ids)) + # table = table.loc[table["id"].isin(ids)].copy() + # self._features[k] = 
FeatureSet(table) for k, table in self.classes(): class_ids = self.segments()[k].unique() - table = table[table["id"].isin(class_ids)].copy() - self._classes[k] = ClassInfo(table) + self._classes[k] = table.filter(lambda df: df["id"].isin(class_ids)) + # table = table[table["id"].isin(class_ids)].copy() + # self._classes[k] = ClassInfo(table) remove_keys = [] for k, table in self.enrollments(): - table = table.loc[table["segmentid"].isin(ids)].copy() + # table = table.loc[table["segmentid"].isin(ids)].copy() + table = table.filter(lambda df: df["segmentid"].isin(ids)) if len(table) > 0: - self._enrollments[k] = EnrollmentMap(table) + self._enrollments[k] = table else: remove_keys.append(k) @@ -790,7 +912,7 @@ def _split_into_trials_and_cohort( seed: int, ): # select test speakers - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) spks = segments["speaker"].unique() trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) @@ -859,20 +981,14 @@ def split_into_trials_and_cohort( segments_male = SegmentSet(segments[segments["gender"] == "m"]) segments_female = SegmentSet(segments[segments["gender"] == "f"]) trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( - segments_male, - num_tar_trials, - num_trial_speakers, - seed, + segments_male, num_tar_trials, num_trial_speakers, seed, ) ( trials_female, enroll_female, cohort_female, ) = self._split_into_trials_and_cohort( - segments_female, - num_tar_trials, - num_trial_speakers, - seed, + segments_female, num_tar_trials, num_trial_speakers, seed, ) trials = TrialKey.merge([trials_male, trials_female]) enroll = EnrollmentMap.cat([enroll_male, enroll_female]) @@ -880,10 +996,7 @@ def split_into_trials_and_cohort( else: segments = self.segments() trials, enroll, cohort = self._split_into_trials_and_cohort( - segments, - num_tar_trials, - num_trial_speakers, - seed, + segments, num_tar_trials, num_trial_speakers, seed, ) dataset_trials = self.clone() @@ -899,3 +1012,176 @@ def split_into_trials_and_cohort( dataset_cohort.clean() return dataset_trials, dataset_cohort + + def remove_short_segments(self, min_length: float, length_name: str = "duration"): + segments = self.segments() + self._segments = segments.filter(lambda df: df[length_name] >= min_length) + self.clean() + + def remove_classes_few_segments( + self, class_name: str, min_segs: int, rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + keep_classes = classes[counts >= min_segs] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def rebuild_class_idx(self, class_name: str): + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def _segments_split(self, val_prob: float, rng: np.random.Generator): + segments = self.segments() + p = rng.permutation(len(segments)) + num_train = int(round((1 - val_prob) * len(p))) + + train_idx = p[:num_train] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_idx = p[num_train:] + val_segs = segments.filter(iindex=val_idx) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_classes( + self, + val_prob: float, + joint_classes: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[joint_classes].apply("-".join, axis=1) + u_classes, 
class_ids = np.unique(classes, return_inverse=True) + train_mask = np.zeros(len(segments), dtype=bool) + kk = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + count = len(idx) + p = rng.permutation(count) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + kk += count - num_train + train_idx = idx[p[:num_train]] + train_mask[train_idx] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_disjoint_classes( + self, val_prob: float, disjoint_classes: List[str], rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[disjoint_classes].apply("-".join, axis=1) + u_classes, class_ids = np.unique(classes, return_inverse=True) + p = rng.permutation(len(u_classes)) + class_ids = p[class_ids] + num_train = int(round((1 - val_prob) * len(segments))) + train_mask = np.zeros(len(segments), dtype=bool) + count_acc = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + train_mask[idx] = True + count = len(idx) + count_acc += count + if count_acc >= num_train: + break + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_and_disjoint_classes( + self, + val_prob: float, + joint_classes: List[str], + disjoint_clases: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + raise NotImplementedError("I'll implement this when I need it") + segments = self.segments() + j_classes = segments[joint_classes].apply("-".join, axis=1) + ju_classes, j_class_ids = np.unique(j_classes, return_inverse=True) + d_classes = segments[disjoint_classes].apply("-".join, axis=1) + du_classes, d_class_ids = np.unique(d_classes, return_inverse=True) + d_p = rng.permutation(len(du_classes)) + d_class_ids = d_p[d_class_ids] + d_sort_idx = np.argsort(d_class_ids) + d_sort_j_class_ids = j_class_ids[d_sort_idx] + + train_d_classes = set() + for c_id in range(len(ju_classes)): + idx = (j_sort_class_ids == c_id).nonzero()[0] + count = len(idx) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + sel_d_class_ids = set(d_sort_idx[:num_train]) + train_d_classes = train_d_classes.union(sel_d_class_ids) + + train_mask = np.zeros(len(segments), dtype=bool) + for c_id in train_d_classes: + mask = d_class_ids == c_id + train_mask[mask] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def split_train_val( + self, + val_prob: float, + joint_classes: Optional[List[str]] = None, + disjoint_classes: Optional[List[str]] = None, + min_train_samples: int = 1, + seed: int = 11235813, + ): + rng = np.random.default_rng(seed) + if joint_classes is None and disjoint_classes is None: + train_segs, val_segs = self._segments_split(val_prob, rng) + elif joint_classes is not None and disjoint_classes is None: + train_segs, val_segs = self._segments_split_joint_classes( + val_prob, joint_classes, min_train_samples, rng, + ) + elif joint_classes is None and disjoint_classes is not None: + train_segs, val_segs = 
self._segments_split_disjoint_classes( + val_prob, disjoint_classes, rng, + ) + else: + train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( + val_prob, joint_classes, disjoint_classes, min_train_samples, rng, + ) + + train_ds = self.clone() + train_ds.set_segments(train_segs) + train_ds.clean() + + val_ds = self.clone() + val_ds.set_segments(val_segs) + val_ds.clean() + + return train_ds, val_ds diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py index f22263cf..80b818d6 100644 --- a/hyperion/utils/fold_list.py +++ b/hyperion/utils/fold_list.py @@ -176,7 +176,7 @@ def create( FoldList object. """ if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 45eab05f..57f3faf2 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -8,6 +8,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path +from typing import Optional, Union, List import numpy as np import pandas as pd @@ -192,14 +193,41 @@ def cat(cls, tables): ].is_unique, """there are duplicated ids in the tables we are concatenating""" return cls(df) - def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): + def filter( + self, predicate=None, items=None, iindex=None, columns=None, by="id", keep=True + ): + """Filters the table and produce a new table with the elements to keep + + Args: + predicate: callable function that defines the filtering criterion e.g.: + lambda df: df["duration"] > 1.0. + items: filters the table based in column value with pandas command: + df.loc[items, by], used only if predicate is None + iindex: filters the table based on integer index with pandas command: + df.iloc[iiindex], used if predicate and items are None + columns: columns to keep of remove. + by: column id to use with itmes criterion + keep: if True, the criterion is used to keep rows, if False it is used + to remove rows + + Returns + InfoTable of the same class as the input. + """ assert ( - items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + predicate is not None + or items is not None + or iindex is not None + or columns is not None + ), "predicate, items, iindex and columns cannot be not None at the same time" df = self.df + if predicate is not None: + mask = predicate(self.df) + if not keep: - if items is not None: + if predicate is not None: + mask = np.logical_not(mask) + elif items is not None: items = np.setdiff1d(df[by], items) elif iindex is not None: iindex = np.setdiff1d(np.arange(len(df)), iindex) @@ -207,7 +235,12 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: columns = np.setdiff1d(df.columns, columns) - if items is not None: + if predicate is not None: + if columns is None: + df = df.loc[mask] + else: + df = df.loc[mask, columns] + elif items is not None: if by != "id": missing = [False if v in df[by] else True for v in items] if any(missing): @@ -225,7 +258,7 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: df = df[columns] - return self.__class__(df) + return self.__class__(df.copy()) def __eq__(self, other): """Equal operator""" @@ -255,7 +288,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
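InfoTable.filter() (and therefore SegmentSet, ClassInfo, FeatureSet and friends) now accepts a callable predicate evaluated on the underlying DataFrame; remove_short_segments(), clean() and the new Dataset.split_train_val() are built on top of it. A small sketch with made-up table contents; the import location is the module path from this patch:

import pandas as pd
from hyperion.utils.segment_set import SegmentSet

segs = SegmentSet(pd.DataFrame({
    "id": ["s1", "s2", "s3"],
    "duration": [0.4, 2.5, 7.1],
}))
long_segs = segs.filter(lambda df: df["duration"] >= 1.0)               # keep rows where the predicate holds
short_segs = segs.filter(lambda df: df["duration"] >= 1.0, keep=False)  # the complement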
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.df)) rng.shuffle(index) self.df = self.df.iloc[index] @@ -279,14 +312,33 @@ def get_loc(self, keys): loc = self.df.index.get_loc(keys) if isinstance(loc, int): return loc - elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + + if isinstance(loc, np.ndarray) and loc.dtype == np.bool: return np.nonzero(loc)[0] - else: - return list(range(loc.start, loc.stop, loc.step)) + + return list(range(loc.start, loc.stop, loc.step)) def get_col_idx(self, keys): return self.df.columns.get_loc(keys) + def add_columns( + self, + right_table, + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, InfoTable): + right_table = right_table.df + + if column_names is not None: + right_table = right_table[column_names] + + if right_on is None: + right_on = on + + self.df = self.df.merge(right_table, how="left", left_on=on, right_on=right_on) + # def __len__(self): # """Returns the number of elements in the list.""" diff --git a/hyperion/utils/math.py b/hyperion/utils/math_funcs.py similarity index 93% rename from hyperion/utils/math.py rename to hyperion/utils/math_funcs.py index 84596f7d..5ee510b9 100644 --- a/hyperion/utils/math.py +++ b/hyperion/utils/math_funcs.py @@ -346,10 +346,26 @@ def int2onehot(class_ids, num_classes=None): return p -def cosine_scoring(x1, x2): +def average_vectors(x, ids): + assert x.shape[0] == len(ids) + num_ids = np.max(ids) + 1 + x_avg = np.zeros((num_ids, x.shape[1]), dtype=x.dtype) + for i in range(num_ids): + mask = ids == i + x_avg[i] = np.mean(x[mask], axis=0) - l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True)) - l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True)) + return x_avg + + +def cosine_scoring(x1, x2, ids1=None, ids2=None): + if ids1 is not None: + x1 = average_vectors(x1, ids1) + + if ids2 is not None: + x2 = average_vectors(x2, ids2) + + l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True) + 1e-10) + l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True) + 1e-10) x1 = x1 / l2_1 x2 = x2 / l2_2 diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 2341beb4..ec617975 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -4,6 +4,7 @@ """ import matplotlib + # matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np @@ -11,7 +12,7 @@ import scipy.stats as stats from mpl_toolkits.mplot3d import Axes3D as plt3d -from .math import invert_pdmat +from .math_funcs import invert_pdmat def plot_gaussian_1D(mu, C, num_sigmas=3, num_pts=100, weight=1, **kwargs): diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 5abf76f2..070e4f53 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -384,7 +384,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 6aef5bb2..a99b4e1e 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -13,42 +13,48 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) - if "start" in df and "recording_id" not in df: - df["recording_id"] = df["id"] + if "start" in df and "recordings" not in df: + df["recordings"] = df["id"] - if "start" not in df and "recording_id" in df: + if "start" not in df and "recordings" in df: df["start"] = 0.0 @property def has_time_marks(self): - return ( - "recording_id" in self.df and "start" in self.df and "duration" in self.df - ) + return "recordings" in self.df and "start" in self.df and "duration" in self.df @property def has_recording_ids(self): - return "recording_id" in self.df + return "recordings" in self.df - def recording_ids(self, ids=None): + @property + def has_recordings(self): + return "recordings" in self.df + + def recordings(self, ids=None): if ids is None: - if "recording_id" in self.df: - return self.df["recording_id"] + if "recordings" in self.df: + return self.df["recordings"] else: return self.df["id"] - if "recording_id" in self.df: - return self.df.loc[ids, "recording_id"] + if "recordings" in self.df: + return self.df.loc[ids, "recordings"] return ids - def recording_time_marks(self, ids): - if "recording" in self.df: - rec_col = "recording_id" - else: - rec_col = "id" + def recording_ids(self, ids=None): + return self.recordings(ids) + + def recording_time_marks(self, ids, recordings_name: str = "recordings"): + if recordings_name == "recordings": + if "recordings" in self.df: + recordings_name = "recordings" + else: + recordings_name = "id" assert "duration" in self.df if "start" not in self.df: self.df["start"] = 0.0 - return self.df.loc[ids, [rec_col, "start", "duration"]] + return self.df.loc[ids, [recordings_name, "start", "duration"]] diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 1bc321a7..62fcd446 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -145,7 +145,7 @@ def load_table(cls, file_path, sep=None): file_path: File to read the list. Returns: - TrialKey object. + SparseTrialKey object. 
""" file_path = Path(file_path) ext = file_path.suffix @@ -156,19 +156,15 @@ def load_table(cls, file_path, sep=None): models = df["modelid"].values segments = df["segmentid"].values is_tar = (df["targettype"] == "target").values - model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True - ) - seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True - ) + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") - for item in zip(model_idx, seg_idx, is_tar): - if item[2]: - tar[item[0], item[1]] = True + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True else: - non[item[0], item[1]] = True + non[i, j] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) @classmethod diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index 7ed9a1d1..760bd1f1 100644 --- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from ..hyp_defs import float_cpu @@ -18,9 +18,6 @@ from .trial_ndx import TrialNdx from .trial_scores import TrialScores -# import h5py - - class SparseTrialScores(TrialScores): @@ -55,6 +52,26 @@ def save_txt(self, file_path): % (self.model_set[r], self.seg_set[c], self.scores[r, c]) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + self.score_mask.eliminate_zeros() + score_mask = self.score_mask.tocoo() + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + for i, j in zip(score_mask.row, score_mask.col): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -90,6 +107,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + scores = sparse.lil_matrix((len(model_set), len(seg_set)), dtype=float_cpu()) + score_mask = sparse.lil_matrix(scores.shape, dtype="bool") + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod def merge(cls, scr_list): raise NotImplementedError() @@ -160,9 +206,9 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if not (np.all(f_mod) and np.all(f_seg)): for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") @@ -172,18 +218,36 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): scores = self.scores.tocoo() new_data = scores.data new_row = scores.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = scores.row == r + # new_row[idx] = i + + # new_col = scores.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = scores.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(scores.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = scores.row == r new_row[idx] = i - new_col = scores.col.copy() + new_col = -1 * np.ones_like(scores.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = scores.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] new_row = new_row[idx] @@ -193,19 +257,37 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): score_mask = self.score_mask.tocoo() new_data = score_mask.data - new_row = score_mask.row.copy() + # new_row = score_mask.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = score_mask.row == r + # new_row[idx] = i + + # new_col = score_mask.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = score_mask.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(score_mask.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = score_mask.row == r new_row[idx] = i - new_col = score_mask.col.copy() + new_col = -1 * np.ones_like(score_mask.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = score_mask.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] 
new_row = new_row[idx] @@ -249,7 +331,7 @@ def align_with_ndx(self, ndx, raise_missing=True): if not scr.score_mask[r, c]: missing_scores = True logging.info( - "missing-scores for %s %s" % (scr.model_set[r], scr.seg_set[c]) + "missing-scores for %s %s", scr.model_set[r], scr.seg_set[c] ) if missing_scores and raise_missing: @@ -291,7 +373,7 @@ def set_valid_scores(self, scores, ndx=None): self.scores = scr.scores self.score_mat = scr.score_mat - self.scores[self.score_mask]=scores + self.scores[self.score_mask] = scores @classmethod def from_trial_scores(cls, scr): @@ -302,6 +384,12 @@ def from_trial_scores(cls, scr): score_mask.eliminate_zeros() return cls(scr.model_set, scr.seg_set, scores, score_mask) + def to_trial_scores(self): + scores = self.scores.toarray("C") + score_mask = self.score_mask.toarray("C") + # scores[~score_mask] = 0.0 + return TrialScores(self.model_set, self.seg_set, scores, score_mask) + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. diff --git a/hyperion/utils/train_val_eval_list.py b/hyperion/utils/train_val_eval_list.py index fd17e240..cbccf093 100644 --- a/hyperion/utils/train_val_eval_list.py +++ b/hyperion/utils/train_val_eval_list.py @@ -207,7 +207,7 @@ def create( part_names = ["train", "eval"] if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 4a99461b..5d8019b6 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -11,7 +11,8 @@ import numpy as np import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_ndx import TrialNdx @@ -178,7 +179,8 @@ def load(cls, file_path, sep=None): Returns: TrialKey object. """ - _, file_ext = path.splitext(file_path) + file_path = Path(file_path) + file_ext = file_path.suffix if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) elif file_ext in ("", ".txt"): @@ -268,7 +270,7 @@ def load_txt(cls, file_path): @classmethod def load_table(cls, file_path, sep=None): - """Loads object from txt file + """Loads object from pandas table file Args: file_path: File to read the list. 
@@ -285,12 +287,8 @@ def load_table(cls, file_path, sep=None): models = df["modelid"].values segments = df["segmentid"].values is_tar = (df["targettype"] == "target").values - model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True - ) - seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True - ) + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") non = np.zeros((len(model_set), len(seg_set)), dtype="bool") for i, j, target_type in zip(model_idx, seg_idx, is_tar): diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index e26d19e2..b7b873df 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -4,12 +4,14 @@ """ import copy -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray class TrialNdx(object): @@ -46,17 +48,20 @@ def sort(self): self.seg_set, s_idx = sort(self.seg_set, return_index=True) self.trial_mask = self.trial_mask[np.ix_(m_idx, s_idx)] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in [".txt", ""]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -71,15 +76,6 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("trial_mask", data=self.trial_mask.astype("uint8")) - # model_set = self.model_set.astype('S') - # f.create_dataset('ID/row_ids', self.model_set.shape, dtype=model_set.dtype) - # f['ID/row_ids'] = model_set - # seg_set = self.seg_set.astype('S') - # f.create_dataset('ID/column_ids', self.seg_set.shape, dtype=seg_set.dtype) - # f['ID/column_ids'] = seg_set - # f.create_dataset('trial_mask', self.trial_mask.shape, dtype='uint8') - # f['trial_mask'] = self.trial_mask.astype('uint8') - def save_txt(self, file_path): """Saves object to txt file. @@ -91,8 +87,25 @@ def save_txt(self, file_path): for item in zip(idx[0], idx[1]): f.write("%s %s\n" % (self.model_set[item[1]], self.seg_set[item[0]])) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}\n") + I, J = self.trial_mask.nonzero() + for i, j in zip(I, J): + f.write(f"{self.model_set[i]}{sep}{self.seg_set[j]}\n") + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -101,11 +114,14 @@ def load(cls, file_path): Returns: TrialNdx object. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -148,6 +164,36 @@ def load_txt(cls, file_path): trial_mask[item[0], item[1]] = True return cls(model_set, seg_set, trial_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialNdx object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + trial_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j in zip(model_idx, seg_idx): + trial_mask[i, j] = True + + return cls(model_set, seg_set, trial_mask) + @classmethod def merge(cls, ndx_list): """Merges several index objects. diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index a486647d..9e7fcd5d 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_cpu -from .list_utils import * + +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -56,17 +58,20 @@ def sort(self): self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in ["", ".txt"]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -100,8 +105,27 @@ def save_txt(self, file_path): ) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + I, J = self.score_mask.nonzero() + for i, j in zip(I, J): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -110,11 +134,14 @@ def load(cls, file_path): Returns: TrialScores object. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -163,6 +190,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores, score_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + score_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + scores = np.zeros((len(model_set), len(seg_set)), dtype=float_cpu()) + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores, score_mask) + @classmethod def merge(cls, scr_list): """Merges several score objects. @@ -235,7 +291,7 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): Filtered TrialScores object. """ - if not (keep): + if not keep: model_set = np.setdiff1d(self.model_set, model_set) seg_set = np.setdiff1d(self.model_set, seg_set) @@ -244,15 +300,15 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] - set_set = self.seg_set[seg_idx] + seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] else: for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index edf2c23a..c1c429f2 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -261,7 +261,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) self.utt_info = self.utt_info.iloc[index] From 77bbad4c76bf147227cce74cef2c3a8b13e4cf83 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 7 Sep 2023 19:52:13 +0000 Subject: [PATCH 74/89] Add new parameters for feat_fusion_end --- ...c2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml | 71 +++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml | 71 +++++++++++++++++++ .../v1/global_conf/config_lid_v7.0_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v7.1_13langs.sh | 42 +++++++++++ .../wav2languageid/hf_wav2languageid.py | 29 ++++++-- .../hf_wav2vec2resnet1d_languageid.py | 3 +- 6 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml new file mode 100644 index 00000000..061014e0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_feat6.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 1.0 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml new file mode 100644 index 00000000..4bd1ad28 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + 
weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_feat12.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 1.0 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh new file mode 100644 index 00000000..13ef37b4 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v7.0_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v7.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v7.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh new file mode 100644 index 00000000..b00c7bb0 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio 
tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v7.1_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v7.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v7.1_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py index 22974afe..ff3a83a7 100644 --- a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py +++ b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py @@ -28,13 +28,16 @@ class HFWav2LanguageID(TorchModel): """ def __init__( - self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_method="weighted-avg" + self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_end=-1, feat_fusion_method="weighted-avg" ): super().__init__() self.hf_feats = hf_feats self.languageid = languageid self.feat_fusion_start = feat_fusion_start + if feat_fusion_end == -1: + feat_fusion_end = self.hf_feats.num_encoder_layers + self.feat_fusion_end = feat_fusion_end self.feat_fusion_method = feat_fusion_method self._hf_context = contextlib.nullcontext() self._make_fuser() @@ -44,7 +47,7 @@ def _make_fuser(self): self.feat_fuser = None return - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + num_layers = self.feat_fusion_end + 1 - self.feat_fusion_start layer_dim = self.hf_feats.hidden_size if self.feat_fusion_method == "weighted-avg": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) @@ -67,10 +70,11 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start :] + hid_feats = hid_feats[self.feat_fusion_start : self.feat_fusion_end + 1] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + # logging.info(torch.tensor(norm_weights.values).to(device)) feats = torch.sum(hid_feats * norm_weights, dim=-1) elif self.feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) @@ -140,6 +144,7 @@ def forward_feats( feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] + assert(len(hid_feats) == self.hf_feats.num_encoder_layers + 1) feats = self._fuse_hid_feats(hid_feats) else: hid_feats = None @@ -331,6 +336,7 @@ def filter_args(**kwargs): "hf_feats", "languageid", "feat_fusion_start", + "feat_fusion_end", "feat_fusion_method", ) args = dict((k, kwargs[k]) for k in valanguageid_args if k in kwargs) @@ -346,6 +352,7 @@ def get_config(self): "hf_feats": hf_cfg, 
"languageid": xvec_cfg, "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_end": self.feat_fusion_end, "feat_fusion_method": self.feat_fusion_method, } @@ -370,9 +377,23 @@ def add_class_args(parser, prefix=None, skip=set()): type=int, help=( "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers" + "the feat_fusion_end" ), ) + + + parser.add_argument( + "--feat-fusion-end", + default=-1, + type=int, + help=( + "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" + "the feat_fusion_end" + ), + ) + + + parser.add_argument( "--feat-fusion-method", default="weighted-avg", diff --git a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py index d357cd87..fb64f060 100644 --- a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py +++ b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py @@ -33,6 +33,7 @@ def __init__( hf_feats: Union[Dict, HFWav2Vec2], languageid: Union[Dict, ResNet1dLanguageID], feat_fusion_start: int = 0, + feat_fusion_end: int = -1, feat_fusion_method: str = "weighted-avg", ): @@ -52,7 +53,7 @@ def __init__( assert isinstance(languageid, ResNet1dLanguageID) assert languageid.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_end, feat_fusion_method) @staticmethod def filter_args(**kwargs): From 89c6e2016b391818c35ab91644bbd091db4f9986 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 8 Sep 2023 11:24:03 -0400 Subject: [PATCH 75/89] finished vox v1.2 except plda --- egs/voxceleb/v1.2/run_007_eval_be.sh | 321 ++++++++++++++++++ .../eval_cosine_scoring_backend_with_qmf.py | 253 +++++++++++--- hyperion/bin/merge_scores.py | 19 +- hyperion/bin/train_qmf.py | 135 ++++++++ .../np/classifiers/logistic_regression.py | 3 +- hyperion/torch/utils/misc.py | 4 +- hyperion/utils/trial_scores.py | 138 +++++++- 7 files changed, 800 insertions(+), 73 deletions(-) create mode 100755 egs/voxceleb/v1.2/run_007_eval_be.sh create mode 100755 hyperion/bin/train_qmf.py diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh new file mode 100755 index 00000000..9084d35b --- /dev/null +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + merge_scores.py --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # eval_cosine_scoring_backend.py \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file 
csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + merge_scores.py --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + merge_scores.py --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + merge_scores.py --output-file 
$score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + train_qmf.py --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + merge_scores.py --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py index f567dd81..0333669f 100755 --- a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -30,6 +30,7 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.classifiers import BinaryLogisticRegression as LR def get_precomp_qm_names(quality_measures): @@ -38,7 +39,6 @@ def get_precomp_qm_names(quality_measures): def normalize_duration(q, min_dur, max_dur, frame_rate): - q = q / frame_rate q = np.log(np.clip(q / frame_rate, a_min=min_dur, a_max=max_dur)) log_min_dur = np.log(min_dur) @@ -99,6 +99,9 @@ def load_trial_data( test_segments.add_columns(test_feats_set) if enroll_feats_set != test_feats_set or enroll_segments != test_segments: enroll_segments.add_columns(enroll_feats_set) + else: + test_segments = test_feats_set + enroll_segments = enroll_feats_set # now we retrive the quality measures q_e = [] @@ -132,7 +135,6 @@ def load_trial_data( def load_cohort_data(segments_file, feats_file): - segments = SegmentSet.load(segments_file) feats_reader = DRF.create(feats_file) x = feats_reader.read(segments["id"], squeeze=True) @@ -160,16 +162,13 @@ def get_score_filepath( test_part_idx, num_test_parts, ): - score_file = Path(score_file) new_suffix = "" if score_name is not None: new_suffix = f".{score_name}" if num_enroll_parts > 1 or num_test_parts > 1: - new_suffix = ( - f"{new_suffix}.{enroll_part_idx}.{test_part_idx}{score_file.suffix}" - ) + new_suffix = f"{new_suffix}.{enroll_part_idx}.{test_part_idx}" if new_suffix: new_suffix = f"{new_suffix}{score_file.suffix}" @@ -177,25 +176,58 @@ def get_score_filepath( return score_file -def save_scores(ndx, scores, score_file, score_name, enroll_part_idx, + +def save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, num_enroll_parts, test_part_idx, - num_test_parts): + num_test_parts, +): + score_file = get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores with to %s", score_file) + scores = TrialScores( + ndx.model_set, ndx.seg_set, scores, ndx.trial_mask, q_measures=q_measures + ) + scores.save(score_file) + -def save_empty_scores(ndx, score_file, score_name, enroll_part_idx, +def save_empty_scores( + ndx, + score_file, + score_name, + q_measures, + enroll_part_idx, num_enroll_parts, test_part_idx, - num_test_parts): + num_test_parts, +): scores = np.zeros(ndx.trial_mask.shape, dtype="float32") - score_file = get_score_filepath(score_file, score_name,enroll_part_idx, - num_enroll_parts, - test_part_idx, - num_test_parts) - - scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) - scores.save(score_file) - + if q_measures is not None: + q_measures = {k: scores for k in q_measures} + save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) def segment_to_trial_qm(q_e, q_t): @@ -226,31 +258,29 @@ def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): return scores, scores_norm, q_trial -def make_qm_table(ndx, scores, scores_norm, q_trial): - if scores_norm is None: - scores = scores[ndx.trial_mask] - 
else: - scores = scores_norm[ndx.trial_mask] - - for qm in q_trial: - q_trial[qm] = q_trial[qm][ndx.trial_mask] +# def make_qm_table(ndx, scores, scores_norm, q_trial): +# if scores_norm is None: +# scores = scores[ndx.trial_mask] +# else: +# scores = scores_norm[ndx.trial_mask] - I, J = np.nonzero(ndx.trial_mask) - modelid = ndx.model_set[I] - segmentid = ndx.seg_set[J] - unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] - - q_dict = { - "id": unique_id, - "modelid": modelid, - "segmentid": segmentid, - "scores": scores, - } - q_dict.update(q_trial) - df = pd.DataFrame(q_dict) - return InfoTable(df) +# for qm in q_trial: +# q_trial[qm] = q_trial[qm][ndx.trial_mask] +# I, J = np.nonzero(ndx.trial_mask) +# modelid = ndx.model_set[I] +# segmentid = ndx.seg_set[J] +# unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] +# q_dict = { +# "id": unique_id, +# "modelid": modelid, +# "segmentid": segmentid, +# "scores": scores, +# } +# q_dict.update(q_trial) +# df = pd.DataFrame(q_dict) +# return InfoTable(df) def eval_backend( @@ -276,7 +306,6 @@ def eval_backend( test_part_idx, num_test_parts, ): - logging.info("loading data") enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( enroll_map_file, @@ -297,8 +326,43 @@ def eval_backend( if not np.any(ndx.trial_mask): # this part doesn't have any trials, save empty files - - + if qmf_file is not None: + quality_measures = None + save_empty_scores( + ndx, + score_file, + "snorm.qmf" if cohort_segments_file is not None else "qmf", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + save_empty_scores( + ndx, + score_file, + None, + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if cohort_segments_file is not None: + save_empty_scores( + ndx, + score_file, + "snorm", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + return + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) q_e = average_qm(q_e, enroll_set, enroll_ids) @@ -362,46 +426,123 @@ def eval_backend( enroll_set, ndx, scores, scores_norm, q_trial ) if qmf_file is None: - qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) - qm_file = get_score_filepath( + save_scores( + ndx, + scores, score_file, - "qm", + None, + q_trial, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - qm_table.save(qm_file) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + # qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + # qm_file = get_score_filepath( + # score_file, + # "qm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # qm_table.save(qm_file) return - score_file_nonorm = get_score_filepath( + save_scores( + ndx, + scores, score_file, None, + None, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - logging.info("saving scores to %s", score_file_nonorm) - scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) - scores.save(score_file_nonorm) if scores_norm is not None: - score_file_snorm = get_score_filepath( + save_scores( + ndx, + scores_norm, score_file, "snorm", + None, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - logging.info("saving scores with AS-Norm to %s", score_file_snorm) - scores.scores = scores_norm - scores.save(score_file_snorm) + 
logging.info("applying qmf") + if scores_norm is None: + score_name = "qmf" + scores_fus = [scores.ravel()] + else: + score_name = "snorm.qmf" + scores_fus = [scores_norm.ravel()] + + q_names = list(q_trial.keys()) + q_names.sort() + for q_name in q_names: + scores_fus.append(q_trial[q_name].ravel()) + + scores_fus = np.vstack(scores_fus).T + lr = LR.load(qmf_file) + scores_fus = lr.predict(scores_fus) + scores_fus = np.reshape(scores_fus, (ndx.num_models, ndx.num_tests)) + save_scores( + ndx, + scores_fus, + score_file, + score_name, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) -if __name__ == "__main__": + # score_file_nonorm = get_score_filepath( + # score_file, + # None, + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores to %s", score_file_nonorm) + # scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + # scores.save(score_file_nonorm) + + # if scores_norm is not None: + # score_file_snorm = get_score_filepath( + # score_file, + # "snorm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores with AS-Norm to %s", score_file_snorm) + # scores.scores = scores_norm + # scores.save(score_file_snorm) + +if __name__ == "__main__": parser = ArgumentParser( description="Eval cosine-scoring with optional AS-Norm and QMF" ) diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py index 6a275f5c..cb8524b7 100755 --- a/hyperion/bin/merge_scores.py +++ b/hyperion/bin/merge_scores.py @@ -18,14 +18,19 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): - output_file = Path(output_file) output_file.parent.mkdir(exist_ok=True, parents=True) ext = output_file.suffix if input_files is None: - input_file_base = output_file.with_suffix("") + if ext in [".h5", ".csv", ".tsv"]: + input_file_base = output_file + else: + input_file_base = output_file.parent / (output_file.name + ".txt") + ext = "" + + logging.info("merging %s* -> %s", input_file_base.with_suffix(""), output_file) input_files = [] for i in range(num_enroll_parts): idx_i = base_idx + i @@ -33,6 +38,8 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas idx_j = base_idx + j input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") input_files.append(input_file_i) + else: + logging.info("merging %s -> %s", " + ".join(input_files), output_file) if ext == ".h5": # if files are h5 we need to load everything in RAM @@ -57,7 +64,6 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas if __name__ == "__main__": - parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument( @@ -88,7 +94,12 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas help="""index of the first job, typically 0 or 1""", ) parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) args = parser.parse_args() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py new file mode 100755 index 00000000..a97e8a5f --- /dev/null +++ b/hyperion/bin/train_qmf.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def print_q_stats(scr, q_names): + for k in q_names: + q_vec = scr.q_measures[k][scr.score_mask] + s = f"{k} stats mean={np.mean(q_vec)} min={np.min(q_vec)} max={np.max(q_vec)} median={np.median(q_vec)}" + logging.info(s) + + +def train_qmf( + score_file, key_file, model_file, prior, lambda_reg, quality_measures, verbose +): + logging.info("load key: %s", key_file) + key = TrialKey.load(key_file) + logging.info("load scores: %s", score_file) + scr = TrialScores.load(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + if quality_measures is None: + quality_measures = list(scr.q_measures.keys()) + quality_measures.sort() + + print_q_stats(scr, quality_measures) + q_tar, q_non = scr.get_tar_non_q_measures(key, quality_measures) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + logging.info("train calibration") + # tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T + # non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T + tar = np.hstack((tar[:, None], q_tar)) + non = np.hstack((non[:, None], q_non)) + + x = np.vstack((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + score_file = Path(score_file) + output_file = score_file.with_suffix(f".qmf{score_file.suffix}") + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +if __name__ == "__main__": + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--quality-measures", + default=None, + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", 
"num_speech_frames"], + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_qmf(**namespace_to_dict(args)) diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 4c4c0cfc..03d9fd13 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -93,7 +93,8 @@ def __init__( super().__init__(**kwargs) if random_state is None: - random_state = np.random.default_rng(seed=lr_seed) + # random_state = np.random.default_rng(seed=lr_seed) + random_state = np.random.RandomState(seed=lr_seed) if bias_scaling is None: if use_bias and solver == "liblinear": diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index b2a3810f..46c09080 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -4,8 +4,8 @@ """ import torch -import torch.cuda.amp as amp import torch.nn as nn +import torch.cuda.amp as amp def l2_norm(x, dim=1, axis=None): @@ -104,3 +104,5 @@ def get_selfsim_tarnon(y, return_mask=False): mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), diagonal=1) return y_bin, mask + + diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index 9e7fcd5d..4a5e59da 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -14,7 +14,7 @@ from ..hyp_defs import float_cpu # from .list_utils import * -from .list_utils import sort, intersect, ismember, split_list, list2ndarray +from .list_utils import intersect, ismember, list2ndarray, sort, split_list from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -28,13 +28,22 @@ class TrialScores(object): seg_set: List of test segment names. scores: Matrix with the scores (num_models x num_segments). score_mask: Boolean matrix with the trials with valid scores to True (num_models x num_segments). + q_measures: optional dictionary of quality measure matrices """ - def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): + def __init__( + self, + model_set=None, + seg_set=None, + scores=None, + score_mask=None, + q_measures=None, + ): self.model_set = model_set self.seg_set = seg_set self.scores = scores self.score_mask = score_mask + self.q_measures = q_measures if (model_set is not None) and (seg_set is not None): self.validate() @@ -57,6 +66,9 @@ def sort(self): ix = np.ix_(m_idx, s_idx) self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] + if self.q_measures is not None: + for k in self.q_measures.keys(): + self.q_measures[k] = self.q_measures[k][ix] def save(self, file_path, sep=None): """Saves object to txt/h5 file. @@ -86,6 +98,10 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("scores", data=self.scores) f.create_dataset("score_mask", data=self.score_mask.astype("uint8")) + if self.q_measures is not None: + q_grp = f.create_group("q_measures") + for k, v in self.q_measures.items(): + q_grp.create_dataset(k, data=v) def save_txt(self, file_path): """Saves object to txt file. @@ -105,6 +121,9 @@ def save_txt(self, file_path): ) ) + if self.q_measures is not None: + logging.warning("q_measures cannot be saved to txt file") + def save_table(self, file_path, sep=None): """Saves object to pandas tabnle file. 
@@ -116,12 +135,20 @@ def save_table(self, file_path, sep=None): if sep is None: sep = "\t" if ".tsv" in ext else "," + q_str = "" + if self.q_measures is not None: + q_str = sep + sep.join(self.q_measures.keys()) + with open(file_path, "w", encoding="utf-8") as f: - f.write(f"modelid{sep}segmentid{sep}LLR\n") + f.write(f"modelid{sep}segmentid{sep}LLR{q_str}\n") I, J = self.score_mask.nonzero() for i, j in zip(I, J): + if self.q_measures is not None: + q_str = sep + sep.join( + [str(v[i, j]) for k, v in self.q_measures.items()] + ) f.write( - f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}{q_str}\n" ) @classmethod @@ -158,7 +185,12 @@ def load_h5(cls, file_path): seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] scores = np.asarray(f["scores"], dtype=float_cpu()) score_mask = np.asarray(f["score_mask"], dtype="bool") - return cls(model_set, seg_set, scores, score_mask) + if "q_measures" in f: + q_grp = f["q_measures"] + q_measures = {k: q_grp[k] for k in q_grp} + else: + q_measures = None + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def load_txt(cls, file_path): @@ -217,7 +249,21 @@ def load_table(cls, file_path, sep=None): score_mask[i, j] = True scores[i, j] = score - return cls(model_set, seg_set, scores, score_mask) + if len(df.columns) > 3: + q_names = df.columns[3:] + q_vals = df.iloc[:, 3:].values + q_measures = {} + for q_name in q_names: + q_measures[q_name] = np.zeros(scores.shape, dtype=float_cpu()) + + for i, j, q_row in zip(model_idx, seg_idx, q_vals): + for col, q_name in enumerate(q_names): + q_measures[q_name][i, j] = q_row[col] + + else: + q_measures = None + + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def merge(cls, scr_list): @@ -234,6 +280,7 @@ def merge(cls, scr_list): seg_set = scr_list[0].seg_set scores = scr_list[0].scores score_mask = scr_list[0].score_mask + q_measures = scr_list[0].q_measures for i in range(1, num_scr): scr_i = scr_list[i] new_model_set = np.union1d(model_set, scr_i.model_set) @@ -252,6 +299,10 @@ def merge(cls, scr_list): scores_1[ix_a] = scores[ix_b] score_mask_1 = np.zeros(shape, dtype="bool") score_mask_1[ix_a] = score_mask[ix_b] + if q_measures is not None: + q_measures_1 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_1[k][ix_a] = q_measures[k][ix_b] trial_mask_2 = np.zeros( (len(new_model_set), len(new_seg_set)), dtype="bool" @@ -268,14 +319,21 @@ def merge(cls, scr_list): scores_2[ix_a] = scr_i.scores[ix_b] score_mask_2 = np.zeros(shape, dtype="bool") score_mask_2[ix_a] = scr_i.score_mask[ix_b] + if q_measures is not None: + q_measures_2 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_2[k][ix_a] = scr_i.q_measures[k][ix_b] model_set = new_model_set seg_set = new_seg_set scores = scores_1 + scores_2 assert not (np.any(np.logical_and(score_mask_1, score_mask_2))) score_mask = np.logical_or(score_mask_1, score_mask_2) + if q_measures is not None: + for k in q_measures.keys(): + q_measures[k] = q_measures_1[k] + q_measures_2[k] - return cls(model_set, seg_set, scores, score_mask) + return cls(model_set, seg_set, scores, score_mask, q_measures) def filter(self, model_set, seg_set, keep=True, raise_missing=True): """Removes elements from TrialScores object. 
@@ -297,13 +355,17 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): f_mod, mod_idx = ismember(model_set, self.model_set) f_seg, seg_idx = ismember(seg_set, self.seg_set) - + q_measures = None if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] else: for i in (f_mod == 0).nonzero()[0]: logging.info("model %s not found", model_set[i]) @@ -318,8 +380,13 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): ix2 = np.ix_(mod_idx[f_mod], seg_idx[f_seg]) scores[ix1] = self.scores[ix2] score_mask[ix1] = self.score_mask[ix2] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = np.zeros(scores.shape, dtype=float_cpu()) + q_measures[k][ix1] = self.q_measures[k][ix2] - return TrialScores(model_set, seg_set, scores, score_mask) + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): """Splits the TrialScores into num_model_parts x num_seg_parts and returns part @@ -340,7 +407,13 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): ix = np.ix_(model_idx1, seg_idx1) scores = self.scores[ix] score_mask = self.score_mask[ix] - return TrialScores(model_set, seg_set, scores, score_mask) + q_measures = None + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] + + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def validate(self): """Validates the attributes of the TrialScores object.""" @@ -362,6 +435,10 @@ def validate(self): else: assert self.score_mask.shape == (len(self.model_set), len(self.seg_set)) + if self.q_measures is not None: + for k in self.q_measures.keys(): + assert self.q_measures[k].shape == self.scores.shape + def align_with_ndx(self, ndx, raise_missing=True): """Aligns scores, model_set and seg_set with TrialNdx or TrialKey. @@ -412,6 +489,34 @@ def get_tar_non(self, key): non = scr.scores[non_mask] return tar, non + def get_tar_non_q_measures(self, key, q_names=None, return_dict=False): + """Returns target and non target scores. + + Args: + key: TrialKey object. + q_names: names of quality measures to return, if None it will return all + + Returns: + Numpy array with target scores. + Numpy array with non-target scores. + """ + scr = self.align_with_ndx(key) + tar_mask = np.logical_and(scr.score_mask, key.tar) + if q_names is None: + q_names = self.q_measures.keys() + tar = {} + for k in q_names: + tar[k] = self.q_measures[k][tar_mask] + non_mask = np.logical_and(scr.score_mask, key.non) + non = {} + for k in q_names: + non[k] = self.q_measures[k][non_mask] + + if not return_dict: + tar = np.vstack(tuple(tar[k] for k in q_names)).T + non = np.vstack(tuple(non[k] for k in q_names)).T + return tar, non + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. 
@@ -450,6 +555,18 @@ def __eq__(self, other): eq = eq and np.all(self.seg_set == other.seg_set) eq = eq and np.all(np.isclose(self.scores, other.scores, atol=1e-5)) eq = eq and np.all(self.score_mask == other.score_mask) + if self.q_measures is not None: + eq = eq and other.q_measures is not None + if eq: + eq = self.q_measures.keys() == other.q_measures.keys() + if eq: + for k in self.q_measures.keys(): + eq = eq and np.all( + np.isclose( + self.q_measures[k], other.q_measures[k], atol=1e-5 + ) + ) + return eq def __ne__(self, other): @@ -463,7 +580,6 @@ def __cmp__(self, other): return 1 def test(key_file="core-core_det5_key.h5"): - key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) From 44f085a86b8c6e9206431cdfbb4f26954dfb4672 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 10 Sep 2023 11:16:43 -0400 Subject: [PATCH 76/89] introduce entry points --- README.md | 4 +- egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml | 34 +++ egs/voxceleb/v1.2/run_001_prepare_data.sh | 26 +- egs/voxceleb/v1.2/run_002_compute_evad.sh | 16 +- .../v1.2/run_003_prepare_noises_rirs.sh | 102 +++---- .../v1.2/run_004_prepare_xvec_train_data.sh | 46 +-- egs/voxceleb/v1.2/run_005_train_xvector.sh | 4 +- egs/voxceleb/v1.2/run_006_extract_xvectors.sh | 16 +- egs/voxceleb/v1.2/run_007_eval_be.sh | 80 ++--- hyperion/bin/__init__.py | 0 hyperion/bin/adv_finetune_xvector_from_wav.py | 18 +- hyperion/bin/apply_mvn_select_frames.py | 36 ++- hyperion/bin/audio_to_duration.py | 17 +- hyperion/bin/compute_energy_vad.py | 17 +- hyperion/bin/compute_mfcc_feats.py | 21 +- hyperion/bin/copy_feats.py | 7 +- hyperion/bin/decode_wav2transducer.py | 20 +- hyperion/bin/decode_wav2vec2rnn_transducer.py | 92 +++--- hyperion/bin/eval_cosine_scoring_backend.py | 27 +- .../eval_cosine_scoring_backend_with_qmf.py | 38 +-- hyperion/bin/eval_verification_metrics.py | 25 +- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 23 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 26 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 27 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 22 +- ...sine_scoring_from_transfer_adv_test_wav.py | 20 +- ...sine_scoring_from_transfer_art_test_wav.py | 27 +- hyperion/bin/eval_xvec_logits_from_wav.py | 28 +- hyperion/bin/extract_wav2vec2xvectors.py | 28 +- hyperion/bin/extract_wav2xvectors.py | 23 +- hyperion/bin/extract_xvectors_from_feats.py | 22 +- hyperion/bin/extract_xvectors_from_wav.py | 23 +- .../extract_xvectors_slidwin_from_feats.py | 34 ++- .../bin/extract_xvectors_slidwin_from_wav.py | 36 ++- hyperion/bin/finetune_wav2vec2transducer.py | 52 ++-- hyperion/bin/finetune_wav2vec2xvector.py | 26 +- hyperion/bin/finetune_wav2xvector.py | 22 +- .../bin/finetune_xvector_dfr_from_feats.py | 17 +- hyperion/bin/finetune_xvector_dfr_from_wav.py | 20 +- hyperion/bin/finetune_xvector_from_feats.py | 16 +- hyperion/bin/finetune_xvector_from_wav.py | 18 +- .../generate_adv_attacks_xvector_classif.py | 31 +- .../bin/generate_adv_attacks_xvector_verif.py | 18 +- hyperion/bin/hyperion_dataset.py | 62 ++-- hyperion/bin/hyperion_tables.py | 21 +- hyperion/bin/make_babble_noise_audio_files.py | 20 +- hyperion/bin/make_wav2xvector.py | 21 +- hyperion/bin/merge_scores.py | 12 +- hyperion/bin/pack_wav_rirs.py | 15 +- hyperion/bin/plot_embedding_tsne.py | 17 +- hyperion/bin/plot_embedding_tsne_per_class.py | 23 +- hyperion/bin/prepare_data.py | 11 +- hyperion/bin/preprocess_audio_files.py | 20 +- .../split_dataset_into_trials_and_cohort.py | 11 +- hyperion/bin/train_qmf.py | 28 +- 
hyperion/bin/train_wav2rnn_transducer.py | 77 ++--- hyperion/bin/train_wav2vec2rnn_transducer.py | 90 +++--- hyperion/bin/train_wav2vec2transducer.py | 79 ++--- hyperion/bin/train_wav2vec2xvector.py | 28 +- hyperion/bin/train_wav2xvector.py | 28 +- hyperion/bin/train_xvector_from_feats.py | 18 +- hyperion/bin/train_xvector_from_wav.py | 18 +- hyperion/io/__init__.py | 8 +- .../np/pdfs/mixtures/exp_family_mixture.py | 165 ---------- .../torch/lr_schedulers/red_lr_on_plateau.py | 6 +- hyperion/utils/queues.py | 287 ------------------ setup.py | 33 +- 67 files changed, 1110 insertions(+), 1193 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml create mode 100644 hyperion/bin/__init__.py delete mode 100644 hyperion/utils/queues.py diff --git a/README.md b/README.md index 7132a031..4838157b 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,11 @@ The full API is described in the documentation page [https://hyperion-ml.readthe We use anaconda or miniconda, though you should be able to make it work in other python distributions To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: ``` -conda create --name ${your_env} python=3.8 +conda create --name ${your_env} python=3.11 conda activate ${your_env} conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch +conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` -In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibility with older PyTorch versions. ### Installing Hyperion diff --git a/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index aef70e96..563d3c2d 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -16,31 +16,31 @@ config_file=default_config.sh if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
- prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ - --use-kaldi-ids \ - --output-dir data/voxceleb1_test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ - --vox1-corpus-dir $voxceleb1_root \ - --output-dir data/voxsrc22_dev + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ -# --vox1-corpus-dir $voxceleb1_root \ -# --output-dir data/voxsrc22_test + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # split vox2 into 2 parts, for cohort and qmf training - split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh index e7593df2..acccace3 100755 --- a/egs/voxceleb/v1.2/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -48,18 +48,18 @@ if [ $stage -le 2 ];then echo "compute vad for $name" $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ hyp_utils/conda_env.sh \ - compute_energy_vad.py --cfg $vad_config \ + hyperion-compute-energy-vad --cfg $vad_config \ --recordings-file data/$name/recordings.csv \ --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ --part-idx JOB --num-parts $nj || exit 1 - hyperion_tables.py cat \ - --table-type features \ - --output-file $vad_dir/$name/vad.csv --num-tables $nj - hyperion_dataset.py add_features \ - --dataset data/$name \ - --features-name vad \ - --features-file $vad_dir/$name/vad.csv + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv done fi diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh index aed1dae4..73c7ed82 100755 --- a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -18,10 +18,10 @@ config_file=default_config.sh if [ $stage -le 1 ]; then for name in noise music speech do - prepare_data.py musan \ - --corpus-dir $musan_root \ - --subset $name \ - --output-dir data/musan_$name + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name done fi @@ -37,66 +37,66 @@ if [ $stage -le 2 ]; then output_dir=exp/proc_audio/$name $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py \ + hyperion-preprocess-audio-files \ --audio-format 
flac \ --part-idx JOB --num-parts $nj \ --recordings-file $input_data_dir/recordings.csv \ --output-path $output_dir \ --output-recordings-file $output_dir/recordings.JOB.csv - - hyperion_tables.py cat \ - --table-type recordings \ - --output-file $output_dir/recordings.csv --num-tables $nj - hyperion_dataset.py set_recordings \ - --dataset $input_data_dir \ - --recordings-file $output_dir/recordings.csv \ - --output-dataset $output_data_dir - + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + done fi if [ $stage -le 3 ]; then - # Create Babble noise from MUSAN speech files - for name in musan_speech - do - input_data_dir=data/$name - output_data_dir=data/${name}_babble - output_dir=exp/proc_audio/${name}_babble - $train_cmd $output_dir/log/make_babble_noise_${name}.log \ - hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py \ - --audio-format flac \ - --min-spks 3 --max-spks 10 --num-reuses 5 \ - --recordings-file $input_data_dir/recordings.csv \ - --output-path $output_dir \ - --output-recordings-file $output_data_dir/recordings.csv - hyperion_dataset.py make_from_recordings \ - --dataset $output_data_dir \ - --recordings-file $output_data_dir/recordings.csv - done + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done fi if [ $stage -le 4 ]; then - if [ ! -d "RIRS_NOISES" ]; then - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom - prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom - prepare_data.py rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real - for rirs in rirs_smallroom rirs_mediumroom rirs_real - do - output_dir=exp/rirs/$rirs - data_dir=data/$rirs - $train_cmd $output_dir/log/pack_rirs_${name}.log \ - hyp_utils/conda_env.sh \ - pack_wav_rirs.py ${args} --input $data_dir/recordings.csv \ - --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; - hyperion_dataset.py add_features --dataset $data_dir \ - --features-name rirs --features-file $output_dir/rirs.csv + if [ ! 
-d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv - done + done fi diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh index 7649ff22..4e0c5b19 100755 --- a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh +++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh @@ -35,42 +35,42 @@ if [ $stage -le 2 ];then $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py \ + hyperion-preprocess-audio-files \ --audio-format flac --remove-dc-offset $vad_args \ --part-idx JOB --num-parts $nj \ --recordings-file data/$nnet_data/recordings.csv \ --output-path $output_dir \ --output-recordings-file $output_dir/recordings.JOB.csv - hyperion_tables.py cat \ - --table-type recordings \ - --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj - hyperion_dataset.py set_recordings $update_durs \ - --dataset data/$nnet_data \ - --recordings-file $output_dir/recordings.csv \ - --output-dataset data/${nnet_data}_proc_audio \ - --remove-features vad + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad fi if [ $stage -le 3 ];then - hyperion_dataset.py remove_short_segments \ - --dataset data/${nnet_data}_proc_audio \ - --output-dataset data/${nnet_data}_filtered \ - --length-name duration --min-length 2.0 + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 - hyperion_dataset.py remove_classes_few_segments \ - --dataset data/${nnet_data}_filtered \ - --class-name speaker --min-segs 4 + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 fi if [ $stage -le 4 ];then - hyperion_dataset.py split_train_val \ - --dataset data/${nnet_data}_filtered \ - --val-prob 0.03 \ - --joint-classes speaker --min-train-samples 1 \ - --seed 1123581321 \ - --train-dataset data/${nnet_data}_xvector_train \ - --val-dataset data/${nnet_data}_xvector_val + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset 
data/${nnet_data}_xvector_val fi diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh index d2f31ea1..2479d565 100755 --- a/egs/voxceleb/v1.2/run_005_train_xvector.sh +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - train_wav2xvector.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2xvector.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh index 09b8c8e9..0dc58048 100755 --- a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -58,15 +58,15 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 2 --max-utt-length 30 \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion_tables.py cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj done fi @@ -88,14 +88,14 @@ if [ $stage -le 2 ]; then echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion_tables.py cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj done fi diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh index 9084d35b..53621488 100755 --- a/egs/voxceleb/v1.2/run_007_eval_be.sh +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -56,7 +56,7 @@ if [ $stage -le 3 ];then do $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file 
csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -66,11 +66,11 @@ if [ $stage -le 3 ];then done done wait - merge_scores.py --output-file $score_cosine_dir/voxceleb1_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_dir/voxceleb1_scores.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -85,22 +85,22 @@ if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then echo "Eval voxsrc2 with Cosine scoring" $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ --score-file $score_cosine_dir/voxsrc22_dev_scores.csv # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ - # hyp_utils/conda_env.sh \ - # eval_cosine_scoring_backend.py \ - # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ - # --ndx-file data/voxsrc22_eval/trials.csv \ - # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ - # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ --key-files data/voxsrc22_dev/trials.csv \ --score-names voxsrc22_dev \ @@ -121,7 +121,7 @@ if [ "$do_snorm" == "true" ];then do $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -135,11 +135,11 @@ if [ "$do_snorm" == "true" ];then sleep 5s done wait - merge_scores.py --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -159,7 +159,7 @@ if [ "$do_snorm" == "true" ];then do $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ 
--enroll-map-file data/voxsrc22_dev/enrollment.csv \ @@ -174,16 +174,16 @@ if [ "$do_snorm" == "true" ];then sleep 10s done wait - merge_scores.py --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ - eval_verification_metrics.py \ - --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ - --key-files data/voxsrc22_dev/trials.csv \ - --score-names voxsrc22_dev \ - --key-names all \ - --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv @@ -202,7 +202,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ --ndx-file data/voxceleb2cat_train_trials/trials.csv \ --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ @@ -216,13 +216,13 @@ if [ "$do_qmf" == "true" ];then sleep 5s done wait - merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts - train_qmf.py --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ - --key-file data/voxceleb2cat_train_trials/trials.csv \ - --model-file $score_cosine_qmf_dir/qmf.h5 - + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + fi if [ $stage -le 8 ];then @@ -234,7 +234,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -252,11 +252,11 @@ if [ "$do_qmf" == "true" ];then for suffix in "" .snorm .snorm.qmf do ( - merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -280,7 +280,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ - 
eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ @@ -299,11 +299,11 @@ if [ "$do_qmf" == "true" ];then for suffix in "" .snorm .snorm.qmf do ( - merge_scores.py --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ --key-files data/voxsrc22_dev/trials.csv \ --score-names voxsrc22_dev \ diff --git a/hyperion/bin/__init__.py b/hyperion/bin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index f45b84a0..ea3d3b80 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -13,6 +13,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.adv_attacks import AttackFactory @@ -29,8 +36,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -43,7 +48,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -138,7 +142,6 @@ def init_attack(feat_extractor, model, wav_scale, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -231,8 +234,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Fine-tune x-vector model from audio files with adversarial training""" @@ -266,6 +268,10 @@ def make_parser(xvec_class): train_xvec(gpu_id, args_sc) +if __name__ == "__main__": + main() + + # def init_data( # audio_path, # train_list, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index bdf53786..f8299edc 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,6 +10,13 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -18,8 +25,6 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def process_feats( @@ -35,7 +40,6 @@ def process_feats( 
compression_method, **kwargs ): - logging.info("initializing") mvn_args = MVN.filter_args(**kwargs) mvn = MVN(**mvn_args) @@ -49,16 +53,23 @@ def process_feats( logging.info("opening output stream: %s" % (output_spec)) with DWF.create( - output_spec, compress=compress, compression_method=compression_method, + output_spec, + compress=compress, + compression_method=compression_method, ) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create( - input_spec, path_prefix=path_prefix, part_idx=part_idx, num_parts=num_parts, + input_spec, + path_prefix=path_prefix, + part_idx=part_idx, + num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = RDRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): key, data = reader.read(1) @@ -91,8 +102,7 @@ def process_feats( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Apply CMVN and remove silence") parser.add_argument("--input", dest="input_spec", required=True) @@ -105,7 +115,9 @@ def process_feats( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", @@ -150,3 +162,7 @@ def process_feats( logging.debug(args) process_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index ac8852a4..8ef6b5c1 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,15 +9,19 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): - input_args = AR.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -36,8 +40,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): seg_set.save(output_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Writes audio file durations to table") parser.add_argument("--cfg", action=ActionConfigFile) @@ -59,3 +62,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): logging.debug(args) audio_to_duration(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 9d50388c..fe0b1d8e 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,10 +9,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR -from hyperion.np.feats import EnergyVAD from jsonargparse import ( ActionConfigFile, ActionParser, @@ -20,9 +16,13 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.feats import EnergyVAD -def 
compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) @@ -78,8 +78,7 @@ def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) @@ -105,3 +104,7 @@ def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): logging.debug(args) compute_vad(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index 442e4141..f42f260d 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,20 +9,24 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import compression_methods from hyperion.np.feats import MFCC -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def compute_mfcc_feats( input_path, output_path, compress, compression_method, write_num_frames, **kwargs ): - mfcc_args = MFCC.filter_args(**kwargs) mfcc = MFCC(**mfcc_args) @@ -34,7 +38,9 @@ def compute_mfcc_feats( reader = DRF.create(input_path, **input_args) writer = DWF.create( - output_path, compress=compress, compression_method=compression_method, + output_path, + compress=compress, + compression_method=compression_method, ) if write_num_frames is not None: @@ -68,8 +74,7 @@ def compute_mfcc_feats( f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute MFCC features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -109,3 +114,7 @@ def compute_mfcc_feats( logging.debug(args) compute_mfcc_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 4549caec..4ffc1a58 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -12,11 +12,12 @@ import time import numpy as np + from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, fromfile_prefix_chars="@", @@ -37,3 +38,7 @@ logging.debug(args) CF(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 972b247c..bcf9e05c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -15,18 +15,22 @@ import sentencepiece as spm import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import 
(beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -118,7 +122,6 @@ def decode_one_batch( def decode_transducer( input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs ): - device = init_device(use_gpu) model = load_model(model_path, device) @@ -202,8 +205,7 @@ def decode_transducer( ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -235,3 +237,7 @@ def decode_transducer( logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 4fdc3140..33aea8c3 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -15,19 +15,23 @@ import sentencepiece as spm import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML from hyperion.torch.models import HFWav2Vec2RNNTransducer -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -48,10 +52,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +82,7 @@ def decode_one_batch( the returned dict. 
""" device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +92,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,9 +120,16 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - infer_args, use_gpu, **kwargs): - +def decode_transducer( + input_spec, + output_spec, + scp_sep, + model_path, + bpe_model, + infer_args, + use_gpu, + **kwargs, +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -142,8 +155,9 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, t2 = time.time() logging.info("processing utt %s", key) with torch.no_grad(): - x = torch.tensor( - x[None, :], dtype=torch.get_default_dtype()).to(device) + x = torch.tensor(x[None, :], dtype=torch.get_default_dtype()).to( + device + ) tot_frames = x.shape[1] logging.info( @@ -157,10 +171,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, if x.shape[1] == 0: y = [""] else: - #y = decode_one_batch(model=model, sp=sp, x=x) - x_lengths = torch.tensor((x.shape[1], ), - dtype=torch.long, - device=device) + # y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor( + (x.shape[1],), dtype=torch.long, device=device + ) y = model.infer(x, x_lengths, **infer_args) y = sp.decode(y[0]) @@ -172,10 +186,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, tot_time = t4 - t1 infer_time = t3 - t2 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "infer-time=%.3f " - "write-time=%.3f " - "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + ( + "utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f" + ), key, tot_time, t2 - t1, @@ -186,16 +202,14 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( - description=("ASR decoding for RNN-T with Wav2vec features")) + description=("ASR decoding for RNN-T with Wav2vec features") + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) AR.add_class_args(parser) parser.add_argument("--model-path", required=True) @@ -203,16 +217,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, HFWav2Vec2RNNTransducer.add_infer_args(parser, "infer-args") parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -220,3 +230,7 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, 
logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py index 1a740024..835cae0b 100755 --- a/hyperion/bin/eval_cosine_scoring_backend.py +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -4,24 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging +import time +from pathlib import Path + +import numpy as np from jsonargparse import ( - ArgumentParser, ActionConfigFile, ActionParser, + ArgumentParser, namespace_to_dict, ) -import time -import logging -from pathlib import Path - -import numpy as np from hyperion.hyp_defs import config_logger -from hyperion.utils import TrialNdx, TrialKey, TrialScores, EnrollmentMap, SegmentSet -from hyperion.utils.math_funcs import cosine_scoring from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring def load_trial_data( @@ -58,7 +58,6 @@ def load_trial_data( def load_cohort_data(segments_file, feats_file): - segments = SegmentSet.load(segments_file) feats_reader = DRF.create(feats_file) x = feats_reader.read(segments["id"], squeeze=True) @@ -81,7 +80,6 @@ def eval_backend( cohort_nbest, avg_cohort_by, ): - logging.info("loading data") enroll_map, ndx, x_e, x_t = load_trial_data( enroll_map_file, @@ -151,8 +149,7 @@ def eval_backend( scores.save(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") parser.add_argument("--enroll-feats-file", default=None) @@ -198,3 +195,7 @@ def eval_backend( logging.debug(args) eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py index 0333669f..4fecf2f3 100755 --- a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -4,33 +4,33 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging +import time from pathlib import Path import numpy as np import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList from hyperion.utils import ( - TrialNdx, - TrialKey, - TrialScores, EnrollmentMap, - SegmentSet, InfoTable, + SegmentSet, + TrialKey, + TrialNdx, + TrialScores, ) -from hyperion.utils.math_funcs import cosine_scoring, average_vectors -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import TransformList -from hyperion.np.score_norm import AdaptSNorm -from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.utils.math_funcs import average_vectors, cosine_scoring def get_precomp_qm_names(quality_measures): @@ -542,7 +542,7 @@ def 
eval_backend( # scores.save(score_file_snorm) -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="Eval cosine-scoring with optional AS-Norm and QMF" ) @@ -611,3 +611,7 @@ def eval_backend( logging.debug(args) eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py index 83227558..98fd37e2 100755 --- a/hyperion/bin/eval_verification_metrics.py +++ b/hyperion/bin/eval_verification_metrics.py @@ -5,19 +5,19 @@ """ import logging from pathlib import Path -import pandas as pd - -from hyperion.hyp_defs import config_logger -from hyperion.np.metrics import VerificationEvaluator as VE +import pandas as pd from jsonargparse import ( ActionConfigFile, - ActionYesNo, ActionParser, + ActionYesNo, ArgumentParser, namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + def eval_verification_metrics( key_files, @@ -30,7 +30,6 @@ def eval_verification_metrics( sparse, output_file, ): - assert len(key_files) == len(key_names) assert len(score_files) == len(score_names) dfs = [] @@ -61,8 +60,7 @@ def eval_verification_metrics( print(df.to_string(), flush=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Evaluate speaker verification metrics") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--key-files", required=True, nargs="+") @@ -85,7 +83,12 @@ def eval_verification_metrics( parser.add_argument("--sparse", default=False, action=ActionYesNo) parser.add_argument("--output-file", required=True) parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) args = parser.parse_args() @@ -94,3 +97,7 @@ def eval_verification_metrics( del kwargs["verbose"] del kwargs["cfg"] eval_verification_metrics(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 7c9d4104..1baad913 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -12,6 +12,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -26,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -44,7 +49,6 @@ def __init__( self.sigma = sigma def forward(self, s_t): - if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) @@ -107,7 +111,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -143,7 +146,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = 
init_feats(**kwargs) xvector_model = load_model(model_path) @@ -319,8 +321,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -336,7 +337,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -415,3 +418,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index fb0d402c..3e4e9229 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,6 +7,7 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path @@ -14,6 +15,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,8 +37,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) torch.backends.cudnn.enabled = False @@ -45,7 +51,7 @@ def __init__( sigma=0, smoothing_after_wavegan=None, wave_gan_defender=None, - wav_scale=2 ** 15 - 1, + wav_scale=2**15 - 1, ): super().__init__() self.feat_extractor = feat_extractor @@ -61,7 +67,6 @@ def __init__( self.apply_wavegan = False if wave_gan_defender is None else True def forward(self, s_t): - # Pre-proceessing defense, wavegan + smoothing [Added Sonal May21] s_t = s_t / self.wav_scale if self.smoothing_after_wavegan: @@ -149,7 +154,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -188,7 +192,6 @@ def eval_cosine_scoring_wavegan( wave_gan_model_ckpt, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -374,8 +377,7 @@ def eval_cosine_scoring_wavegan( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -391,7 +393,9 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -488,3 +492,7 @@ def eval_cosine_scoring_wavegan( logging.debug(args) eval_cosine_scoring_wavegan(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py 
b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 2d5baa17..781cdbdf 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -15,6 +15,13 @@ import torch.nn as nn from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -22,16 +29,15 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -69,7 +75,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -156,7 +161,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -343,8 +347,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector " @@ -363,7 +366,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -431,3 +436,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 76af5d75..2ebb7e3d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -12,6 +12,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import RandomAccessDataReaderFactory as DRF @@ -24,8 +31,6 @@ from hyperion.torch.utils.misc import l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -66,7 +71,6 
@@ def load_calibrator(cal_file, device): def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) try: @@ -104,7 +108,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -199,8 +202,7 @@ def eval_cosine_scoring( s.save_txt(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -216,7 +218,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -266,3 +270,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index f33402a1..a6f8efa4 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -12,6 +12,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -26,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -104,7 +109,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -146,7 +150,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) # load victim model feat_extractor = init_feats(**kwargs["feats"]) @@ -204,7 +207,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -337,8 +340,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -435,3 +437,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index f94dc497..7b8bc245 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -15,6 +15,13 @@ import torch.nn as nn from 
art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -22,16 +29,15 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -113,7 +119,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -155,7 +160,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) # load victim model @@ -361,8 +365,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -384,7 +387,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -456,3 +461,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index f60c7508..b2e6a665 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -110,7 +111,6 @@ def eval_xvec( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -131,15 +131,16 @@ def eval_xvec( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - 
logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -224,8 +225,7 @@ def eval_xvec( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Evaluates x-vectors logits from waveform computing " @@ -299,3 +299,7 @@ def eval_xvec( logging.debug(args) eval_xvec(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 5eba1b99..f2df9581 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -13,6 +13,13 @@ import pandas as pd import torch import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) resamplers = {} @@ -122,7 +123,6 @@ def extract_xvectors( use_gpu, **kwargs, ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -143,13 +143,14 @@ def extract_xvectors( ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -252,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -340,3 +340,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py index 7b04fcc8..763df3fc 100755 --- a/hyperion/bin/extract_wav2xvectors.py +++ b/hyperion/bin/extract_wav2xvectors.py @@ -13,6 +13,13 @@ import pandas as pd import torch import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - 
ArgumentParser, - namespace_to_dict, -) resamplers = {} @@ -121,7 +122,6 @@ def extract_xvectors( use_gpu, **kwargs, ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -143,10 +143,8 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: - logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -255,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Extracts x-vectors from waveform computing acoustic features on the fly""" ) @@ -331,3 +328,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index b02db70c..e70225c2 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -11,6 +11,13 @@ import numpy as np import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -19,12 +26,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -82,7 +83,6 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) @@ -96,7 +96,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -174,8 +173,7 @@ def extract_xvectors( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extracts x-vectors from features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -244,3 +242,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 6a8130d3..71a24bd4 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF 
from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -111,7 +112,6 @@ def extract_xvectors( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -132,12 +132,10 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(recordings_file, ar_args) ) with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -235,8 +233,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing acoustic features on the fly" @@ -317,3 +314,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index bcec5133..a1186ed2 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -12,6 +12,13 @@ import numpy as np import torch import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -20,12 +27,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -73,7 +74,6 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) @@ -86,7 +86,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -118,7 +117,13 @@ def extract_xvectors( t4 = time.time() if x.shape[0] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): @@ -195,8 +200,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extract x-vectors over a sliding window") parser.add_argument("--cfg", action=ActionConfigFile) @@ -208,7 +212,9 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) MVN.add_class_args(parser, prefix="mvn") @@ -298,3 +304,7 @@ 
def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index f1a64e1b..f973b566 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -13,6 +13,13 @@ import pandas as pd import torch import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -22,12 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -99,7 +100,6 @@ def extract_xvectors( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -124,15 +124,16 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -172,7 +173,13 @@ def extract_xvectors( t6 = time.time() if x.shape[1] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: x = x.transpose(1, 2).contiguous() y = ( @@ -255,8 +262,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extract x-vectors over a sliding window" @@ -347,3 +353,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index 6f17f800..138f18f7 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -14,6 +14,14 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -22,9 +30,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -43,8 +48,7 @@ def transducer_collate(batch): audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) - return 
torch.transpose(audio,0,1), audio_length, target - + return torch.transpose(audio, 0, 1), audio_length, target def init_data(partition, rank, num_gpus, **kwargs): @@ -73,7 +77,9 @@ def init_data(partition, rank, num_gpus, **kwargs): largs = ( {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -89,11 +95,7 @@ def init_model(in_model_file, rank, model_class, **kwargs): return model - - - def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -119,7 +121,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} + metrics = {} trainer = Trainer( model, device=device, @@ -135,7 +137,7 @@ def train_model(gpu_id, args): def make_parser(model_class): parser = ArgumentParser() - + parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") AD.add_class_args(train_parser, prefix="dataset", skip={}) @@ -161,27 +163,23 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument( "--data.train.dataset.text_file", - type=str, + type=str, ) - - parser.add_argument("--data.val.dataset.text_file", type=str) - + + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.add_argument( "--data.train.dataset.bpe_model", - type=str, + type=str, ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" - ) - + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") parser.add_argument("--in-model-file", required=True) model_class.add_finetune_args(parser, prefix="model") @@ -198,8 +196,10 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") +def main(): + parser = ArgumentParser( + description="Fine-tune Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -228,3 +228,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index fc3c7084..7020e32f 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -13,18 +13,25 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) 
+from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -34,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -99,7 +105,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -182,8 +187,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Finetunes Wav2Vec2XVector model from audio files" ) @@ -215,3 +219,7 @@ def make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py index b100b544..97356c01 100755 --- a/hyperion/bin/finetune_wav2xvector.py +++ b/hyperion/bin/finetune_wav2xvector.py @@ -11,6 +11,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -27,12 +34,6 @@ # from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) xvec_dict = { "resnet": RXVec, @@ -45,7 +46,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -115,7 +115,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -196,8 +195,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -226,3 +224,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 17cafb85..140cc3a2 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -14,6 +14,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -22,8 +29,6 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import 
XVectorTrainerDeepFeatReg as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): @@ -60,7 +65,6 @@ def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **k def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -194,8 +198,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Fine-tune x-vector model with deep feature loss regularization" ) @@ -278,3 +281,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index f7832a47..9d745e67 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -8,10 +8,18 @@ import os import sys import time +from pathlib import Path import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -21,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data( @@ -36,7 +42,6 @@ def init_data( rank, **kwargs ): - ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: @@ -82,7 +87,6 @@ def init_feats(rank, **kwargs): def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -103,7 +107,6 @@ def init_xvector( def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -231,8 +234,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Fine-tune x-vector model with deep feature loss " @@ -327,3 +329,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index ac9c2d0b..01e0c778 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -12,6 +12,13 @@ import numpy as np import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -20,8 +27,6 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils 
import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): @@ -161,8 +166,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model") parser.add_argument("--cfg", action=ActionConfigFile) @@ -230,3 +234,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 1c7cbe58..2c884d0b 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -11,6 +11,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -25,8 +32,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -39,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -120,7 +124,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -208,8 +211,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -238,3 +240,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 4336b7b9..00452695 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -14,6 +14,13 @@ import torch import torch.nn as nn import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -24,12 +31,6 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def read_utt_list(list_file, class2int_file, part_idx, num_parts): @@ -156,14 +157,13 @@ def generate_attacks( num_parts, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, **kwargs) model.to(device) logging.info("opening audio read stream: %s" % (wav_file)) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file ** audio_args) + audio_reader = 
AR(wav_file, **audio_args) wav_scale = audio_reader.wav_scale logging.info("opening audio write stream: %s" % (output_wav_dir)) @@ -207,7 +207,7 @@ def generate_attacks( s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) target = torch.as_tensor([class_id], dtype=torch.long).to(device) if vad_spec is not None: - vad = v_reader.read([key.seg_set[j]])[0] + vad = v_reader.read([key])[0] tot_frames = len(vad) speech_frames = np.sum(vad) vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( @@ -217,7 +217,7 @@ def generate_attacks( logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" % ( - key.seg_set[j], + key, speech_frames, tot_frames, speech_frames / tot_frames * 100, @@ -315,8 +315,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker classification with x-vectors" ) @@ -332,7 +331,9 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -413,3 +414,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 363e3afc..ab7d907b 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -14,6 +14,13 @@ import torch import torch.nn as nn import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -28,8 +35,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -73,7 +78,6 @@ def forward(self, s_t): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -173,7 +177,6 @@ def generate_attacks( random_seed, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs) model.to(device) @@ -346,8 +349,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker verification with x-vectors+cos+calibration" ) @@ -442,3 +444,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 2e3a35ec..17fff2ba 100755 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -7,6 +7,14 @@ from pathlib import Path from typing import List, Optional, Union +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) +
from hyperion.hyp_defs import config_logger from hyperion.utils import ( ClassInfo, @@ -18,13 +26,6 @@ RecordingSet, SegmentSet, ) -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, - ActionYesNo, -) subcommand_list = [ "add_features", @@ -41,7 +42,12 @@ def add_common_args(parser): parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) @@ -145,7 +151,8 @@ def make_make_from_recordings_parser(): def make_from_recordings( - dataset: PathLike, recordings_file: PathLike, + dataset: PathLike, + recordings_file: PathLike, ): output_dataset = dataset import pandas as pd @@ -186,7 +193,10 @@ def make_remove_short_segments_parser(): def remove_short_segments( - dataset: PathLike, min_length: float, length_name: str, output_dataset: PathLike, + dataset: PathLike, + min_length: float, + length_name: str, + output_dataset: PathLike, ): if output_dataset is None: output_dataset = dataset @@ -216,7 +226,9 @@ def make_rebuild_class_idx_parser(): def rebuild_class_idx( - dataset: PathLike, class_name: str, output_dataset: PathLike, + dataset: PathLike, + class_name: str, + output_dataset: PathLike, ): if output_dataset is None: output_dataset = dataset @@ -301,14 +313,21 @@ def make_split_train_val_parser(): help="""types of classes that need to have different classes in train and val""", ) parser.add_argument( - "--seed", default=11235813, type=int, help="""random seed""", + "--seed", + default=11235813, + type=int, + help="""random seed""", ) parser.add_argument( - "--train-dataset", required=True, help="""output train dataset dir""", + "--train-dataset", + required=True, + help="""output train dataset dir""", ) parser.add_argument( - "--val-dataset", required=True, help="""output val dataset dir""", + "--val-dataset", + required=True, + help="""output val dataset dir""", ) add_common_args(parser) @@ -361,7 +380,8 @@ def make_copy_parser(): def copy( - dataset: PathLike, output_dataset: PathLike, + dataset: PathLike, + output_dataset: PathLike, ): dataset = Dataset.load(dataset, lazy=True) dataset.save(output_dataset) @@ -383,7 +403,10 @@ def make_add_cols_to_segments_parser(): help="""columns to copy to segments table""", ) parser.add_argument( - "--on", default=["id"], nargs="+", help="""columns to match both tables rows""", + "--on", + default=["id"], + nargs="+", + help="""columns to match both tables rows""", ) parser.add_argument( "--right-on", @@ -418,8 +441,7 @@ def add_cols_to_segments( dataset.save(output_dataset) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") parser.add_argument("--cfg", action=ActionConfigFile) @@ -436,3 +458,7 @@ def add_cols_to_segments( del kwargs["verbose"] del kwargs["cfg"] globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 7f61b35a..59472d83 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -7,6 +7,13 @@ from pathlib import Path from typing import List, Optional, Union +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.utils import ( ClassInfo, @@ -17,12 +24,6 @@ RecordingSet, SegmentSet, ) -from jsonargparse import ( - ActionConfigFile, - ActionParser, 
- ArgumentParser, - namespace_to_dict, -) subcommand_list = ["cat"] table_dict = { @@ -87,7 +88,6 @@ def cat( num_tables: int, base_idx: int = 1, ): - assert input_files is not None or num_tables != 0 output_file = Path(output_file) if input_files is None: @@ -108,8 +108,7 @@ def cat( output_table.save(output_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) @@ -126,3 +125,7 @@ def cat( del kwargs["verbose"] del kwargs["cfg"] globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 68e5b22b..43d6ab91 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,11 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import AudioWriter as Writer -from hyperion.io import RandomAccessAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.utils import Utt2Info from jsonargparse import ( ActionConfigFile, ActionParser, @@ -22,9 +17,14 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + def make_noise(xs, max_value): - lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) num_tiles = np.ceil(max_len / lens) @@ -53,7 +53,6 @@ def make_babble_noise_audio_files( random_seed=112358, **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -105,8 +104,7 @@ def make_babble_noise_audio_files( logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -137,3 +135,7 @@ def make_babble_noise_audio_files( logging.debug(args) make_babble_noise_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py index b5972d1b..b3a1a2d5 100755 --- a/hyperion/bin/make_wav2xvector.py +++ b/hyperion/bin/make_wav2xvector.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger # from hyperion.torch import TorchModelLoader as TML @@ -26,12 +33,6 @@ from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec from hyperion.torch.models import Wav2ResNetXVector as W2RXVec from hyperion.torch.narchs import AudioFeatsMVN as AF -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_feats(feats): @@ -51,7 +52,6 @@ def load_model(model_path): def make_wav2xvector(feats, xvector_path, output_path): - feats = init_feats(feats) xvector_model = load_model(xvector_path) if isinstance(xvector_model, RXVec): @@ -67,8 +67,7 @@ def make_wav2xvector(feats, xvector_path, output_path): model.save(output_path) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Combines the feature extractor 
config with XVector model to produce a Wav2XVector model with integrated feature extraction""" @@ -89,3 +88,7 @@ def make_wav2xvector(feats, xvector_path, output_path): logging.debug(args) make_wav2xvector(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py index cb8524b7..72ab6010 100755 --- a/hyperion/bin/merge_scores.py +++ b/hyperion/bin/merge_scores.py @@ -6,9 +6,6 @@ import logging from pathlib import Path -from hyperion.hyp_defs import config_logger - -from hyperion.utils import TrialScores from jsonargparse import ( ActionConfigFile, ActionParser, @@ -16,6 +13,9 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialScores + def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): output_file = Path(output_file) @@ -63,7 +63,7 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas write_header = False -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument( @@ -108,3 +108,7 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas del kwargs["verbose"] del kwargs["cfg"] merge_scores(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index b2a1bc2b..bf88d674 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,9 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR from jsonargparse import ( ActionConfigFile, ActionParser, @@ -20,9 +17,12 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR -def pack_wav_rirs(input_path, output_spec, **kwargs): +def pack_wav_rirs(input_path, output_spec, **kwargs): writer = DWF.create(output_spec, compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: @@ -47,8 +47,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -69,3 +68,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.debug(args) pack_wav_rirs(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e2157e3e..60d7ac5c 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,12 +13,18 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] @@ -40,7 
+46,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -126,8 +131,7 @@ def plot_embedding_tsne( # plt.clf() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Projects embeddings using TSNE") parser.add_argument("--train-v-file", required=True) @@ -162,6 +166,9 @@ def plot_embedding_tsne( plot_embedding_tsne(**namespace_to_dict(args)) +if __name__ == "__main__": + main() + # #!/usr/bin/env python # """ # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 14da4d07..08e4ef70 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,12 +13,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from hyperion.hyp_defs import config_logger -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.clustering import AHC -from hyperion.np.transforms import PCA, LNorm, SklTSNE -from hyperion.utils import SegmentSet -from hyperion.utils.math_funcs import cosine_scoring from jsonargparse import ( ActionConfigFile, ActionParser, @@ -27,6 +21,13 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.clustering import AHC +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet +from hyperion.utils.math_funcs import cosine_scoring + matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] @@ -50,7 +51,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -92,7 +92,7 @@ def plot_embedding_tsne( if do_ahc: if cluster_tsne: # in the low dim space, we cannot use cosine scoring - x2 = np.sum(x_tsne ** 2, axis=1)[:, None] + x2 = np.sum(x_tsne**2, axis=1)[:, None] d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T d2 = np.clip(d2, a_min=0, a_max=None) scores = -np.sqrt(d2) @@ -140,8 +140,7 @@ def plot_embedding_tsne( train_segs.save(output_dir / "segments.csv") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Projects embeddings using TSNE, " @@ -194,3 +193,7 @@ def plot_embedding_tsne( logging.debug(args) plot_embedding_tsne(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index f6723c7d..dd1bde27 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,8 +6,6 @@ import logging from pathlib import Path -from hyperion.data_prep import DataPrep -from hyperion.hyp_defs import config_logger from jsonargparse import ( ActionConfigFile, ActionParser, @@ -15,6 +13,9 @@ namespace_to_dict, ) +from hyperion.data_prep import DataPrep +from hyperion.hyp_defs import config_logger + def make_parser(data_prep_class): parser = ArgumentParser() @@ -22,7 +23,7 @@ def make_parser(data_prep_class): return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="""Prepares a dataset into relational database tables""" ) @@ -39,3 +40,7 @@ def make_parser(data_prep_class): args = namespace_to_dict(args)[args.subcommand] data_prep = data_prep_class(**args) data_prep.prepare() + + 
+if __name__ == "__main__": + main() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index bda9a503..5e98a477 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -10,11 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import AudioWriter as Writer -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.utils import Utt2Info from jsonargparse import ( ActionConfigFile, ActionParser, @@ -23,6 +18,12 @@ ) from scipy import ndimage, signal +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + def resample_vad(vad, length): step = (len(vad) - 1) / length @@ -59,7 +60,6 @@ def process_audio_files( remove_dc_offset=False, **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -72,7 +72,6 @@ def process_audio_files( with AR(recordings_file, **input_args) as reader, Writer( output_path, output_recordings_file, **output_args ) as writer: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -147,8 +146,7 @@ def process_audio_files( u2td.save(write_time_durs_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" ) @@ -204,3 +202,7 @@ def process_audio_files( logging.debug(args) process_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py index 24ec10bf..50c2f1f2 100755 --- a/hyperion/bin/split_dataset_into_trials_and_cohort.py +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -6,8 +6,6 @@ import logging from pathlib import Path -from hyperion.hyp_defs import config_logger -from hyperion.utils import Dataset from jsonargparse import ( ActionConfigFile, ActionParser, @@ -16,8 +14,11 @@ namespace_to_dict, ) -if __name__ == "__main__": +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset + +def main(): parser = ArgumentParser( description=( """Split speakers in dataset into test speaker to create ASV trials and @@ -66,3 +67,7 @@ trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) trials_dataset.save(trials_dir) cohort_dataset.save(cohort_dir) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py index a97e8a5f..42aabe0c 100755 --- a/hyperion/bin/train_qmf.py +++ b/hyperion/bin/train_qmf.py @@ -6,25 +6,25 @@ Trains calibration for SRE18 tel condition """ -import sys +import logging import os +import sys +import time +from pathlib import Path + +import numpy as np from jsonargparse import ( - ArgumentParser, ActionConfigFile, ActionParser, + ArgumentParser, namespace_to_dict, ) -import time -import logging -from pathlib import Path -import numpy as np - -from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.utils.trial_scores import TrialScores -from hyperion.utils.trial_key import TrialKey -from hyperion.np.metrics import compute_act_dcf, compute_min_dcf 
+from hyperion.hyp_defs import config_logger, float_cpu from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_scores import TrialScores def print_q_stats(scr, q_names): @@ -110,7 +110,7 @@ def train_qmf( scr_out.save(output_file) -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Trains QMF calibration") parser.add_argument("--score-file", required=True) @@ -133,3 +133,7 @@ def train_qmf( logging.debug(args) train_qmf(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 8930b299..c00c4633 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -14,15 +14,20 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "rnn_rnn_transducer": Wav2RNNRNNTransducer, @@ -72,14 +77,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -97,7 +100,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -105,8 +107,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -115,8 +117,11 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -159,8 +164,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + 
data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -176,34 +180,27 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": - parser = ArgumentParser( - description="Train RNN Transducer model from audio files") +def main(): + parser = ArgumentParser(description="Train RNN Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -232,3 +229,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 7018c406..5b802454 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -14,23 +14,29 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.models import (HFWav2Vec2ConformerV1RNNTransducer, - HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer) +from hyperion.torch.models import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, +) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, - "hf_wav2vec2conformer_v1_rnn_transducer": - HFWav2Vec2ConformerV1RNNTransducer, + "hf_wav2vec2conformer_v1_rnn_transducer": HFWav2Vec2ConformerV1RNNTransducer, # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, @@ -88,14 +94,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = 
data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -113,7 +117,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -121,8 +124,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -137,13 +140,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -181,8 +187,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -198,34 +203,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) 
parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -254,3 +254,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 55f3b996..77a22bb8 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -14,6 +14,14 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -21,9 +29,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -73,14 +78,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -98,7 +101,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -106,8 +108,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -122,13 +124,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -166,8 +171,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) 
parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -183,34 +187,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -239,3 +238,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index f132a35c..e6dd3d3e 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -14,6 +14,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -25,12 +32,6 @@ ) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -40,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -83,7 +83,6 @@ def init_model(num_classes, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -105,7 +104,11 @@ def train_model(gpu_id, args): logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -162,8 +165,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train Wav2Vec2XVector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -193,3 +195,7 @@ def 
make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py index ddf292b8..7373a338 100755 --- a/hyperion/bin/train_wav2xvector.py +++ b/hyperion/bin/train_wav2xvector.py @@ -9,6 +9,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -23,12 +30,6 @@ # from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) xvec_dict = { "resnet": RXVec, @@ -41,7 +42,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -84,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -107,7 +106,11 @@ def train_xvec(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -164,8 +167,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train Wav2XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -194,3 +196,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 71bba080..a2acdf4c 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -13,6 +13,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.data import FeatSeqDataset as SD @@ -25,8 +32,6 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -39,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] sd_args = SD.filter_args(**kwargs["dataset"]) sampler_args = Sampler.filter_args(**kwargs["sampler"]) @@ -80,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -164,8 +167,7 @@ def make_parser(xvec_class): return parser -if 
__name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -196,3 +198,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index b2e36cac..c3f6170d 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -9,6 +9,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -22,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -36,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -90,7 +94,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -176,8 +179,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -206,3 +208,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 14b1b35f..aa5ac653 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -16,10 +16,10 @@ from .hyp_data_reader import * from .hyp_data_writer import * from .kaldi_data_reader import * -from .packed_audio_reader import (RandomAccessPackedAudioReader, - SequentialPackedAudioReader) +from .packed_audio_reader import ( + RandomAccessPackedAudioReader, + SequentialPackedAudioReader, +) from .packed_audio_writer import PackedAudioWriter from .segment_vad_reader import SegmentVADReader from .vad_rw_factory import VADReaderFactory - -# from .queues import * diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 2186522e..d1cf7f68 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -8,7 +8,6 @@ from ....hyp_defs import float_cpu from ....utils.math_funcs import logsumexp, softmax -from ....utils.queues import GeneratorQueue from ..core import PDF @@ -110,86 +109,6 @@ def fit( else: return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def fit_generator( - self, - generator, - train_steps, - epochs=10, - val_data=None, - val_steps=0, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Trains the model from data read by a generator function. - This function is deprecated. 
- - Args: - generator: train data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - train_steps: number of training steps / epoch - epochs: number of epochs. - val_data: val. data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - val_steps: number of validation steps / epoch - max_queue_size: max. size of the generator queue. - workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - log p(X) of the training data. - log p(x) per sample. - log p(X) of the val. data, if present. - log p(x) of the val. data per sample, if present. - """ - - do_validation = bool(val_data) - val_gen = hasattr(val_data, "next") or hasattr(val_data, "__next__") - if val_gen and not val_steps: - raise ValueError( - "When using a generator for validation data, " - "you must specify a value for " - "`val_steps`." - ) - - if do_validation and not val_gen: - x, u_x_val, sample_weight_val = self.tuple2data(val_data) - log_h_val = self.accum_log_h(x, sample_weight_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - N, u_x, log_h = self.Estep_generator( - generator, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - - self.Mstep(N, u_x) - elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) - - if val_data is not None: - if val_gen: - N, u_x, log_h_val = self.Estep_generator( - val_data, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - else: - N, u_x = self.Estep(val_data, u_x_val, sample_weight_val) - elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) - - if val_data is None: - return elbo, elbo / x.shape[0] - else: - return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def log_h(self, x): """Computes log h(x) of the exp. family.""" return 0 @@ -404,7 +323,6 @@ def _accum_suff_stats_segments_prob_1batch( def _accum_suff_stats_segments_prob_nbatches( self, x, prob, sample_weight, batch_size ): - sw_i = None for i1 in range(0, x.shape[0], batch_size): i2 = np.minimum(i1 + batch_size, x.shape[0]) @@ -458,7 +376,6 @@ def accum_suff_stats_sorttime( def _accum_suff_stats_sorttime_1batch( self, x, frame_length, frame_shift, u_x=None, sample_weight=None ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -494,7 +411,6 @@ def _accum_suff_stats_sorttime_1batch( def _accum_suff_stats_sorttime_nbatches( self, x, frame_length, frame_shift, sample_weight, batch_size ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -539,65 +455,6 @@ def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): """ return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - def Estep_generator( - self, - generator, - num_steps, - return_log_h, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Expectation step, where data is read from a generator function. - - Args: - generator: data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - num_steps: number of steps / epoch - return_log_h: returns accumlated log h(x). - max_queue_size: max. size of the generator queue. 
- workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - N zero order sufficient statistics (number of samples). - Accumlated sufficient statistics \sum u(x). - Accumlated log h(x) (optional). - """ - wait_time = 0.01 # in secs - queue = None - N = None - acc_u_x = None - log_h = 0 - try: - queue = GeneratorQueue( - generator, use_multiprocessing=use_multiprocessing, wait_time=wait_time - ) - queue.start(workers=workers, max_queue_size=max_queue_size) - queue_generator = queue.get() - - cur_step = 0 - for cur_step in range(num_steps): - data = next(queue_generator) - x, u_x, sample_weight = self.tuple2data(data) - N_i, u_x_i = self.Estep(x, u_x, sample_weight) - if return_log_h: - log_h += self.accum_log_h(x) - if cur_step == 0: - N = N_i - acc_u_x = u_x_i - else: - N += N_i - acc_u_x += u_x_i - finally: - if queue is not None: - queue.stop() - - if return_log_h: - return N, acc_u_x, log_h - else: - return N, acc_u_x - def sum_suff_stats(self, N, u_x): """Sums suff. stats from muttiple sub-processes. @@ -754,28 +611,6 @@ def get_config(self): base_config = super(ExpFamilyMixture, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @staticmethod - def tuple2data(data): - if isinstance(data, tuple): - if len(data) == 2: - x, u_x = data - if u_x.ndim == 2: - sample_weight = None - elif u_x.ndim == 1: - sample_weight = u_x - u_x = None - else: - raise ValueError("Generator output: " + str(data)) - elif len(data) == 3: - x, u_x, sample_weight = data - else: - raise ValueError("Generator output: " + str(data)) - else: - x = data - u_x = None - sample_weight = None - return x, u_x, sample_weight - @staticmethod def compute_A_nat(eta): """Computes A_theta from the natural param.""" diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py index 7a2e82f8..3f7b2ec7 100644 --- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py +++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py @@ -7,7 +7,11 @@ from functools import partial import torch -from torch._six import inf + +try: + from torch import inf +except: + from torch._six import inf from .lr_scheduler import LRScheduler diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py deleted file mode 100644 index 8bfd0166..00000000 --- a/hyperion/utils/queues.py +++ /dev/null @@ -1,287 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import copy -import multiprocessing -import threading -import time -import warnings -from abc import abstractmethod - -import numpy as np -import six - -try: - import queue -except ImportError: - import Queue as queue - - -class SequenceQueue(object): - """Base class to enqueue inputs. - - The task of an Queue is to use parallelism to speed up preprocessing. - This is done with processes or threads. - - # Examples - - ```python - enqueuer = SequenceQueue(...) - enqueuer.start() - datas = enqueuer.get() - for data in datas: - # Use the inputs; training, evaluating, predicting. - # ... stop sometime. - enqueuer.close() - ``` - - The `enqueuer.get()` should be an infinite stream of datas. - - """ - - @abstractmethod - def is_running(self): - raise NotImplemented - - @abstractmethod - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers. 
- - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`). - """ - raise NotImplemented - - @abstractmethod - def stop(self, timeout=None): - """Stop running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called start(). - - # Arguments - timeout: maximum time to wait on thread.join() - """ - raise NotImplemented - - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplemented - - -class OrderedQueue(SequenceQueue): - """Builds a Queue from a Sequence. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - sequence: A `keras.utils.data_utils.Sequence` object. - use_multiprocessing: use multiprocessing if True, otherwise threading - scheduling: Sequential querying of datas if 'sequential', random otherwise. - """ - - def __init__(self, sequence, use_multiprocessing=False, scheduling="sequential"): - self.sequence = sequence - self.use_multiprocessing = use_multiprocessing - self.scheduling = scheduling - self.workers = 0 - self.executor = None - self.queue = None - self.run_thread = None - self.stop_signal = None - - def is_running(self): - return self.stop_signal is not None and not self.stop_signal.is_set() - - def start(self, workers=1, max_queue_size=10): - """Start the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, workers could block on `put()`) - """ - if self.use_multiprocessing: - self.executor = multiprocessing.Pool(workers) - else: - self.executor = ThreadPool(workers) - self.queue = queue.Queue(max_queue_size) - self.stop_signal = threading.Event() - self.run_thread = threading.Thread(target=self._run) - self.run_thread.daemon = True - self.run_thread.start() - - def _run(self): - """Function to submit request to the executor and queue the `Future` objects.""" - sequence = list(range(len(self.sequence))) - while True: - if self.scheduling is not "sequential": - random.shuffle(sequence) - for i in sequence: - if self.stop_signal.is_set(): - return - self.queue.put( - self.executor.apply_async(get_index, (self.sequence, i)), block=True - ) - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples (inputs, targets) - or (inputs, targets, sample_weights) - """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - if inputs is not None: - yield inputs - except Exception as e: - self.stop() - raise StopIteration(e) - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.executor.close() - self.executor.join() - self.run_thread.join(timeout) - - -class GeneratorQueue(SequenceQueue): - """Builds a queue out of a data generator. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 
- - # Arguments - generator: a generator function which endlessly yields data - use_multiprocessing: use multiprocessing if True, otherwise threading - wait_time: time to sleep in-between calls to `put()` - random_seed: Initial seed for workers, - will be incremented by one for each workers. - """ - - def __init__( - self, generator, use_multiprocessing=False, wait_time=0.05, random_seed=None - ): - self.wait_time = wait_time - self._generator = generator - self._use_multiprocessing = use_multiprocessing - self._threads = [] - self._stop_event = None - self.queue = None - self.random_seed = random_seed - - def start(self, workers=1, max_queue_size=10): - """Kicks off threads which add data from the generator into the queue. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`) - """ - - def data_generator_task(): - while not self._stop_event.is_set(): - try: - if self._use_multiprocessing or self.queue.qsize() < max_queue_size: - generator_output = next(self._generator) - self.queue.put(generator_output) - else: - time.sleep(self.wait_time) - except Exception: - self._stop_event.set() - raise - - try: - if self._use_multiprocessing: - self.queue = multiprocessing.Queue(maxsize=max_queue_size) - self._stop_event = multiprocessing.Event() - else: - self.queue = queue.Queue() - self._stop_event = threading.Event() - - for _ in range(workers): - if self._use_multiprocessing: - # Reset random seed else all children processes - # share the same seed - np.random.seed(self.random_seed) - thread = multiprocessing.Process(target=data_generator_task) - thread.daemon = True - if self.random_seed is not None: - self.random_seed += 1 - else: - thread = threading.Thread(target=data_generator_task) - self._threads.append(thread) - thread.start() - except: - self.stop() - raise - - def is_running(self): - return self._stop_event is not None and not self._stop_event.is_set() - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()`. - """ - if self.is_running(): - self._stop_event.set() - - for thread in self._threads: - if thread.is_alive(): - if self._use_multiprocessing: - thread.terminate() - else: - thread.join(timeout) - - if self._use_multiprocessing: - if self.queue is not None: - self.queue.close() - - self._threads = [] - self._stop_event = None - self.queue = None - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - A generator - """ - while self.is_running(): - if not self.queue.empty(): - inputs = self.queue.get() - if inputs is not None: - yield inputs - else: - time.sleep(self.wait_time) diff --git a/setup.py b/setup.py index 9780586d..e1fb35cc 100644 --- a/setup.py +++ b/setup.py @@ -15,15 +15,26 @@ # limitations under the License. 
# -import setuptools from pathlib import Path +import setuptools + project_root = Path(__file__).parent -with open(project_root / "apps.txt") as f: - apps = f.read().splitlines() +# with open(project_root / "apps.txt") as f: +# apps = f.read().splitlines() -apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +# apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +binaries = (project_root / "hyperion" / "bin").glob("*.py") +console_scripts = [] +for binary in binaries: + stem = binary.stem + script_name = stem.replace("hyperion_", "").replace("_", "-") + if script_name[0] == "-": + continue + module = f"hyperion.bin.{stem}:main" + console_script = f"hyperion-{script_name} = {module}" + console_scripts.append(console_script) with open(project_root / "requirements.txt") as f: requirements = f.read().splitlines() @@ -77,10 +88,22 @@ def get_version(): "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], python_requires=">=3.7", install_requires=requirements, - scripts=apps, + entry_points={ + "console_scripts": console_scripts, + } + # entry_points={ + # "console_scripts": [ + # "hyperion-prepare-data = hyperion.bin.prepare_data:main", + # "hyperion-train-wav2xvector = hyperion.bin.train_wav2xvector:main", + # ] + # }, + # scripts=apps, ) From 610547682764789844af201c1a16bccc6b8d34ab Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 10 Sep 2023 20:01:36 -0400 Subject: [PATCH 77/89] make it work with cuda 11 --- README.md | 11 +++++++++-- hyp_utils/conda_env.sh | 32 +++++++++++++++++--------------- hyperion/torch/utils/ddp.py | 13 +++++-------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 4838157b..d56406d7 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,21 @@ The full API is described in the documentation page [https://hyperion-ml.readthe ### Prerequisites We use anaconda or miniconda, though you should be able to make it work in other python distributions - To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: + To start, you should create a new environment and install PyTorch: ``` conda create --name ${your_env} python=3.11 conda activate ${your_env} -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` +For systems with cuda 10.2 driver: +``` +conda create --name ${your_env} python=3.10 +conda activate ${your_env} +conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch +``` + + ### Installing Hyperion - First, clone the repo: diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 8d5c67c1..90ffa369 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -52,22 +52,24 @@ fi # echo "LRU_CACHE_CAPACITY=$LRU_CACHE_CAPACITY" conda activate $conda_env -command="python" +command="" if [ $num_gpus -gt 0 ];then - # set CUDA_VISIBLE_DEVICES - if [ !
-z "$SGE_HGR_gpu" ]; then - echo "SGE_HGR_gpu=$SGE_HGR_gpu" - export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') - else - # seach location of free-gpu program in the PATH or hyp_utils directory - free_gpu=$(which free-gpu) - if [ -z "$free_gpu" ];then - free_gpu=$(which hyp_utils/free-gpu) - fi - - if [ ! -z "$free_gpu" ];then - # if free-gpu found set env var, otherwise we assume that you can use any gpu - export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + if [ -z "$CUDA_VISIBLE_DEVICES" ];then + # set CUDA_VISIBLE_DEVICES + if [ ! -z "$SGE_HGR_gpu" ]; then + echo "SGE_HGR_gpu=$SGE_HGR_gpu" + export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') + else + # seach location of free-gpu program in the PATH or hyp_utils directory + free_gpu=$(which free-gpu) + if [ -z "$free_gpu" ];then + free_gpu=$(which hyp_utils/free-gpu) + fi + + if [ ! -z "$free_gpu" ];then + # if free-gpu found set env var, otherwise we assume that you can use any gpu + export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + fi fi fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 1aefb3d4..4f006c0a 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -6,19 +6,16 @@ import logging import os -from fairscale.nn.data_parallel import \ - FullyShardedDataParallel as FullyShardedDDP -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - import torch import torch.distributed as dist import torch.nn as nn +from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP from .devices import open_device def add_ddp_args(parser): - parser.add_argument( "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" ) @@ -50,7 +47,6 @@ def filter_ddp_args(**kwargs): def ddp_init( gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr="localhost", master_port=None ): - rank = node_id * num_gpus + gpu_id world_size = num_nodes * num_gpus @@ -62,15 +58,16 @@ def ddp_init( os.environ["MASTER_PORT"] = master_port logging.info( - f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" ) dist.init_process_group( "nccl", rank=rank, world_size=world_size, ) + torch.cuda.set_device(rank) torch.tensor([0]).to(gpu_id) - device = torch.device('cuda', gpu_id) + device = torch.device("cuda", gpu_id) return device, rank, world_size # return gpu_id, rank, world_size From 392cd30f6bae594e9121bde48379aae787d16e6f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 11 Sep 2023 11:41:35 -0400 Subject: [PATCH 78/89] started vox/v2.1 recipe and fix some readmes --- egs/voxceleb/v1.1/README.md | 2 + egs/voxceleb/v1.2/README.md | 249 ++++++-------- .../train_cfwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_cwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_fwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 73 ++++ ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 69 ++++ .../conf/train_resnet34_xvec_stage1_v3.0.yaml | 71 ++++ .../conf/train_resnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ 
.../train_tseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 44 +++ .../config_fbank80_stmn_cwseresnet34.v3.0.sh | 45 +++ .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 44 +++ ...onfig_fbank80_stmn_idrnd_resnet100.v3.0.sh | 44 +++ .../config_fbank80_stmn_resnet34.v3.0.sh | 44 +++ .../config_fbank80_stmn_tseresnet34.v3.0.sh | 44 +++ egs/voxceleb/v2.1/cmd.sh | 28 ++ egs/voxceleb/v2.1/conf/clsp.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_long.conf | 13 + egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_short.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_v100.conf | 11 + egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml | 35 ++ ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml | 59 ++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 63 ++++ ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 73 ++++ ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ egs/voxceleb/v2.1/conf/vad_16k.yaml | 8 + ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 45 +++ ...wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml | 44 +++ .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 44 +++ .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml | 45 +++ .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml | 44 +++ .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 45 +++ .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 44 +++ egs/voxceleb/v2.1/datapath.sh | 23 ++ egs/voxceleb/v2.1/default_config.sh | 1 + ...wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 54 +++ ...g_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh | 54 +++ ...ig_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | 54 +++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 54 +++ ...onfig_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | 54 +++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | 54 +++ .../config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 54 +++ egs/voxceleb/v2.1/hyp_utils | 1 + egs/voxceleb/v2.1/path.sh | 5 + egs/voxceleb/v2.1/run_001_prepare_data.sh | 46 +++ egs/voxceleb/v2.1/run_002_compute_evad.sh | 66 ++++ .../v2.1/run_003_prepare_noises_rirs.sh | 102 ++++++ .../v2.1/run_004_prepare_xvec_train_data.sh | 76 +++++ egs/voxceleb/v2.1/run_005_train_xvector.sh | 78 +++++ egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 103 ++++++ egs/voxceleb/v2.1/run_007_eval_be.sh | 321 ++++++++++++++++++ egs/voxceleb/v2/README.md | 10 +- egs/voxceleb/v2/default_config.sh | 2 +- 71 files changed, 3829 insertions(+), 152 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml create 
mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh create mode 100755 egs/voxceleb/v2.1/cmd.sh create mode 100644 egs/voxceleb/v2.1/conf/clsp.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/vad_16k.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml create mode 100644 
egs/voxceleb/v2.1/datapath.sh create mode 120000 egs/voxceleb/v2.1/default_config.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh create mode 120000 egs/voxceleb/v2.1/hyp_utils create mode 100755 egs/voxceleb/v2.1/path.sh create mode 100755 egs/voxceleb/v2.1/run_001_prepare_data.sh create mode 100755 egs/voxceleb/v2.1/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v2.1/run_005_train_xvector.sh create mode 100755 egs/voxceleb/v2.1/run_006_extract_xvectors.sh create mode 100755 egs/voxceleb/v2.1/run_007_eval_be.sh diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 3b9eeaa9..efdb77c1 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -1,5 +1,7 @@ # VoxCeleb V1.1 +This recipe will be deprecated, use V1.2 + Recipe for the VoxCeleb Speaker Verification Task ## Differences w.r.t VoxCeleb V1 recipe diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index 1ee9468f..6e8ba07a 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -1,4 +1,4 @@ -# VoxCeleb V1.1 +# VoxCeleb V1.2 Recipe for the VoxCeleb Speaker Verification Task @@ -9,7 +9,7 @@ In recipe version V1: - Augmentation is performed using Kaldi scripts and wav-reverbate tool - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. -In this recipe: +In V1.1: - We compute speech augmentations and acoustic features are computed always on-the-fly, we don't dump any features to disk. - Augmentation is performed using Hyperin SpeechAugment class. @@ -18,6 +18,11 @@ In this recipe: which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - Babble noise is created offline by mixing 3-10 single speaker files. +In V1.2: + - Feature extractor is embedded into the pytorch model in classes derived from Wav2XVector base class. + - Kaldi format is replaced by new format based on pandas tables + - Kaldi style bash scripts are removed and replaced by python scripts + - Most python scripts are called using Hyperion entry points ## Citing @@ -30,13 +35,11 @@ In this recipe: ## Test data - Test data is VoxCeleb 1 - - We evaluate 6 conditions: + - We evaluate the 3 conditions (with cleaned lists): - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers - - Voxceleb-O-cleaned: VoxCeleb-O cleaned-up of some errors - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 - - Voxceleb-E-cleaned: VoxCeleb-E cleaned-up of some errors - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials.
- - Voxceleb-H-cleaned: VoxCeleb-H cleaned-up of some errors + ## Usage @@ -44,9 +47,9 @@ In this recipe: - By default it will use Light ResNet (16 base channels) - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_005_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_006_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true +run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` @@ -66,25 +69,26 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. - Prepares RIRs by compacting then into HDF5 files, to be used by SpeechAugment class. - - `run_010_prepare_xvec_train_data.sh` + - `run_004_prepare_xvec_train_data.sh` - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. - Removes silence from the audios - Removes utterances shorter than 4secs and speakers with less than 8 utterances. - Creates training and validation lists for x-vector training - - `run_011_train_xvector.sh` + - `run_005_train_xvector.sh` - Trains the x-vector network - - `run_030_extract_xvectors.sh` + - `run_006_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training - Exctracts x-vectors for VoxCeleb1 test sets - - `run_040_eval_be.sh` + - `run_007_eval_be.sh` - Trains PLDA and evals PLDA and cosine scoring back-ends ## Results + ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -95,9 +99,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | || | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | +| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 | +| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | +| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| +| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | + ### VoxCeleb 1 Entire-Clean trial list @@ -109,9 +132,27 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | +| | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | +| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| +| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | +| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -123,9 +164,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | +| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | +| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | +| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | +| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | +| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | +| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | +| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + ### VoxSRC2022 dev @@ -137,127 +197,24 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | - -## Results before 2023 - -### VoxCeleb 1 Original-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - - -### VoxCeleb 1 Entire-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - - -### VoxCeleb 1 Hard-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | +| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | +| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f4306e2e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + 
epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..b5458f9d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..01b2cc50 --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..74553395 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + 
min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..11d33ae2 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..6659b2f6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + 
data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..58d22733 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + 
dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 256 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..56d18bd0 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Channel-freq-wise-SE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 
00000000..68849f78 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# Channel-wise ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh new file mode 100644 index 00000000..f962c2b3 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh new file mode 100644 index 00000000..6ea334b4 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh @@ -0,0 +1,44 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + 
plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh new file mode 100644 index 00000000..bb5d990c --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -0,0 +1,44 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34.v3.0 + +nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh new file mode 100644 index 00000000..2528d13f --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# TSE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_tseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/cmd.sh b/egs/voxceleb/v2.1/cmd.sh new file mode 100755 index 00000000..040f458b --- /dev/null +++ b/egs/voxceleb/v2.1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2.1/conf/clsp.conf b/egs/voxceleb/v2.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_long.conf b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff 
--git a/egs/voxceleb/v2.1/conf/coe_gpu_short.conf b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + 
log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml new file mode 100644 index 00000000..0b1d0454 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + 
sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + 
lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + 
class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - 
class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + 
margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/vad_16k.yaml b/egs/voxceleb/v2.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml new file mode 100644 index 00000000..d9c9b782 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 
4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + 
intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/datapath.sh b/egs/voxceleb/v2.1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v2.1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v2.1/default_config.sh b/egs/voxceleb/v2.1/default_config.sh new file mode 120000 index 00000000..f2d8812d --- /dev/null +++ b/egs/voxceleb/v2.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + 
+nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh new file mode 100644 index 00000000..b4130fad --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + 
plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc 
--- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/hyp_utils b/egs/voxceleb/v2.1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v2.1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2.1/path.sh b/egs/voxceleb/v2.1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/v2.1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. 
$TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v2.1/run_001_prepare_data.sh b/egs/voxceleb/v2.1/run_001_prepare_data.sh new file mode 100755 index 00000000..563d3c2d --- /dev/null +++ b/egs/voxceleb/v2.1/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2.1/run_002_compute_evad.sh b/egs/voxceleb/v2.1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/v2.1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh new file mode 100755 index 00000000..4e0c5b19 --- /dev/null +++ b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +nodes=b1 +nj=40 +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh new file mode 100755 index 00000000..2479d565 --- /dev/null +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh new file mode 100755 index 00000000..0dc58048 --- /dev/null +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v2.1/run_007_eval_be.sh b/egs/voxceleb/v2.1/run_007_eval_be.sh new file mode 100755 index 00000000..53621488 --- /dev/null +++ b/egs/voxceleb/v2.1/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + 
hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j 
--num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index a005b6e8..0bafe85e 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -26,12 +26,12 @@ Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Huber ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use + - By default it will use config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh - For better performance use ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_011_train_xvector.sh --config-file global_conf/other_config.sh +run_030_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true +run_040_eval_be.sh --config-file global_conf/other_config.sh ``` @@ -155,7 +155,7 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | | config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | | | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | -| | | | Cosine + QMF | 0.242 | 0.144 | 0.231 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | | config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | | | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | | | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh index abcc2a2e..f2d8812d 120000 --- a/egs/voxceleb/v2/default_config.sh +++ b/egs/voxceleb/v2/default_config.sh @@ -1 +1 @@ -global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh \ No newline at end of file +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file From ed35173f534f98cb85b609642226b99d17163ddb Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 11 Sep 2023 12:12:49 -0400 Subject: [PATCH 79/89] vox/v2.1 recipe done, not tested --- egs/voxceleb/v2.1/run_005_train_xvector.sh | 27 ++++++++++++++++--- egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 9 ++++--- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh index 2479d565..eb1c591e 100755 --- a/egs/voxceleb/v2.1/run_005_train_xvector.sh +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -56,7 +56,7 @@ if [ $stage -le 1 ]; then fi -# Large Margin Fine-tuning +# Finetune full model if [ 
$stage -le 2 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -76,3 +76,24 @@ if [ $stage -le 2 ]; then --num-gpus $ngpu \ fi + +# Finetune full model +if [ $stage -le 3 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh index 0dc58048..2cfe27fe 100755 --- a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -8,15 +8,16 @@ set -e stage=1 -nnet_stage=2 +nnet_stage=3 config_file=default_config.sh use_gpu=false +hf_chunk_length=120.0 #seconds xvec_chunk_length=120.0 . parse_options.sh || exit 1; . 
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" num_gpus=1 else @@ -58,7 +59,7 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 2 --max-utt-length 30 \ @@ -88,7 +89,7 @@ if [ $stage -le 2 ]; then echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --model-path $nnet \ From 8760d055520609a57bc69ac9fc05ef159e9f336a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 12 Sep 2023 14:06:02 -0400 Subject: [PATCH 80/89] implemented lora in w2v2, not tested --- hyperion/io/bin_vad_reader.py | 4 +- hyperion/np/augment/noise_augment.py | 2 +- hyperion/torch/layers/__init__.py | 13 +- hyperion/torch/layers/lora.py | 80 +++++ .../models/wav2xvectors/hf_wav2xvector.py | 26 +- hyperion/torch/tpm/hf/hf_wav2vec2.py | 18 +- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 320 ++++++++++++++---- hyperion/utils/dataset.py | 68 +++- requirements.txt | 4 +- 9 files changed, 425 insertions(+), 110 deletions(-) create mode 100644 hyperion/torch/layers/lora.py diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index 82e2a0c5..8ce91d15 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -59,7 +59,7 @@ def read( vad = self.r.read(keys) output_vad = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) offset_i = offset[i] if offset_is_list else offset num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i = self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) @@ -77,7 +77,7 @@ def read_timestamps(self, keys, merge_tol=0.001): vad = self.r.read(keys) ts = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) ts_i = bin_vad_to_timestamps( vad_i, self.frame_length / 1000, diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 1cc1a0be..92bd57dd 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10((x**2).sum() + 1e-10) @staticmethod def snr(x, n): diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 6b508b0e..bea52c95 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -4,20 +4,23 @@ """ from .activation_factory import ActivationFactory -from .attention import (LocalScaledDotProdAttRelPosEncV1, - LocalScaledDotProdAttV1, ScaledDotProdAttRelPosEncV1, - ScaledDotProdAttV1) +from .attention import ( + 
LocalScaledDotProdAttRelPosEncV1, + LocalScaledDotProdAttV1, + ScaledDotProdAttRelPosEncV1, + ScaledDotProdAttV1, +) from .audio_feats import * from .audio_feats_factory import AudioFeatsFactory from .calibrators import LinBinCalibrator from .dropout import DropConnect1d, DropConnect2d, Dropout1d from .global_pool import * from .interpolate import Interpolate +from .lora import LoRAFactory from .margin_losses import ArcLossOutput, CosLossOutput, SubCenterArcLossOutput from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import (ConvPosEncoder, NoPosEncoder, PosEncoder, - RelPosEncoder) +from .pos_encoder import ConvPosEncoder, NoPosEncoder, PosEncoder, RelPosEncoder from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py new file mode 100644 index 00000000..1436caf5 --- /dev/null +++ b/hyperion/torch/layers/lora.py @@ -0,0 +1,80 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Union + +import loralib as lora +import torch.nn as nn +from loralib import * + + +class LoRAFactory: + def create_from_pretrained( + layer: Union[nn.Embedding, nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d], + r: int = 8, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + merge_weights: bool = True, + ): + if isinstance(layer, nn.Embedding): + lora_layer = lora.Embedding( + layer.num_embeddings, + layer.embedding_dim, + padding_idx=layer.padding_idx, + max_norm=layer.max_norm, + norm_type=layer.norm_type, + scale_grad_by_freq=layer.scale_grad_by_freq, + sparse=layer.sparse, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + + elif isinstance(layer, nn.Linear): + bias = layer.bias is not None + lora_layer = lora.Linear( + layer.in_features, + layer.out_features, + bias=bias, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + if isinstance(layer, nn.Conv1d): + lora_class = lora.Conv1d + elif isinstance(layer, nn.Conv2d): + lora_class = lora.Conv2d + elif isinstance(layer, nn.Conv3d): + lora_class = lora.Conv3d + + bias = layer.bias is not None + lora_layer = lora_class( + layer.in_channels, + layer.out_channels, + layer.kernel_size, + stride=layer.stride, + padding=layer.padding, + dilation=layer.dilation, + groups=layer.groups, + bias=bias, + padding_mode=layer.padding_mode, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + return lora_layer diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 24ab5bbb..925f1172 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -5,10 +5,9 @@ import contextlib import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser 
from ...torch_model import TorchModel from ...utils import remove_silence @@ -29,7 +28,6 @@ class HFWav2XVector(TorchModel): def __init__( self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" ): - super().__init__() self.hf_feats = hf_feats self.xvector = xvector @@ -222,7 +220,6 @@ def extract_embed( embed_layer=None, detach_chunks=False, ): - if vad_samples is not None: x, x_lengths = remove_silence(x, vad_samples, x_lengths) @@ -256,6 +253,9 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + def has_param_groups(self): return self.hf_feats.has_param_groups() @@ -296,6 +296,15 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") @@ -310,7 +319,6 @@ def set_train_mode(self, mode): self._train_mode = mode def _train(self, train_mode: str): - if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode == "ft-embed-affine": @@ -322,6 +330,9 @@ def _train(self, train_mode: str): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.xvector._train("full") @@ -339,6 +350,9 @@ def valid_train_modes(): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod @@ -353,7 +367,6 @@ def filter_args(**kwargs): return args def get_config(self): - hf_cfg = self.hf_feats.get_config() xvec_cfg = self.xvector.get_config() del hf_cfg["class_name"] @@ -375,7 +388,6 @@ def change_config(self, hf_feats, xvector): @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 26da7beb..dd5de2fe 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2Config, Wav2Vec2Model - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -204,8 +203,13 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -223,6 +227,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + 
lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index a9c4ddef..2c8d239f 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -8,12 +8,13 @@ from turtle import right from typing import List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor +from ....utils.misc import filter_func_args +from ...layers import LoRAFactory from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs @@ -55,6 +56,12 @@ class HFWav2VecBase(TorchModel): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -75,6 +82,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -90,6 +103,12 @@ def __init__( self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -153,6 +172,16 @@ def __init__( self._feature_encoder_context = None self._frame_shift = None + self.hf_model = None + + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. 
@@ -225,18 +254,36 @@ def change_config( self, override_dropouts: bool, override_spec_augment: bool, + override_lora: bool, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, **kwargs, ): if override_spec_augment: - logging.info("overriding speech augment") + logging.info(f"overriding speech augment with args={kwargs}") self.change_spec_augment(**kwargs) if override_dropouts: - logging.info("overriding hf model dropouts") + logging.info(f"overriding hf model dropouts with args={kwargs}") self.change_dropouts(**kwargs) + if override_lora: + logging.info("overriding LoRA config") + self.change_lora( + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr @@ -259,12 +306,109 @@ def change_spec_augment( self.hf_model.config.mask_feature_length = mask_feature_length self.hf_model.config.mask_feature_min_masks = mask_feature_min_masks + def change_lora( + self, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + if not self.use_lora: + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + pass + else: + # TODO + pass + else: + if use_lora: + # TODO + pass + else: + # TODO + pass + + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + + def _make_lora_layers( + self, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + counts = {k: 0 for k in lora_components} + self._recursive_replace_layer_by_lora( + self.hf_model, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + for k, v in counts.items(): + logging.info("count of LoRA layers for %s = %d", k, v) + assert v > 0, f"did not make any {k} LoRA" + + @staticmethod + def _recursive_replace_layer_by_lora( + model: nn.Module, + counts: dict, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + for name, module in model.named_children(): + if len(list(module.children())) > 0: + HFWav2VecBase._recursive_replace_layer_by_lora( + module, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + if isinstance(module, nn.Linear) and name in lora_components: + lora_layer = LoRAFactory.create_from_pretrained( + module, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=lora_merge_weights, + ) + setattr(model, name, lora_layer) + counts[name] += 1 + def change_dropouts(self, **kwargs): pass # needs to be overloaded def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def freeze_except_lora(self, bias=None): + bias = "none" if bias is None else bias + from ...layers.lora import mark_only_lora_as_trainable + + mark_only_lora_as_trainable(self.hf_model, bias=bias) + def has_param_groups(self): return 
self.feat_extract_lr is not None or self.encoder_lr is not None @@ -302,14 +446,14 @@ def _normalize(self, x, x_mask=None): """Normalizes the audio to have zero mean and unit variance.""" if x_mask is None: x = x - x.mean(dim=1, keepdim=True) - std = torch.sqrt((x ** 2).mean(dim=1, keepdim=True) + 1e-7) + std = torch.sqrt((x**2).mean(dim=1, keepdim=True) + 1e-7) x = x / std else: x_mask = x_mask.to(dtype=x.dtype) x_samples = torch.mean(x_mask, dim=1, keepdim=True) x_mean = torch.mean(x * x_mask, dim=1, keepdim=True) / x_samples - x2_mean = torch.mean(x ** 2 * x_mask, dim=1, keepdim=True) / x_samples - std = torch.sqrt(x2_mean - x_mean ** 2 + 1e-7) + x2_mean = torch.mean(x**2 * x_mask, dim=1, keepdim=True) / x_samples + std = torch.sqrt(x2_mean - x_mean**2 + 1e-7) x = (x - x_mean) / std return x @@ -544,14 +688,6 @@ def forward_long_impl( else scale_seq_lengths(x_lengths, max_out_length, max_in_length) ) output["hidden_states_lengths"] = feat_lengths - # print( - # "lens", - # mol0, - # max_out_length, - # output.last_hidden_state.size(1), - # output.hidden_states[0].size(1), - # flush=True, - # ) return output def get_config(self): @@ -572,6 +708,14 @@ def get_config(self): "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": self.sample_frequency, + "feat_extract_lr": self.feat_extract_lr, + "encoder_lr": self.encoder_lr, + "use_lora": self.use_lora, + "lora_components": self.lora_components, + "lora_rank": self.lora_rank, + "lora_alpha": self.lora_alpha, + "lora_dropout": self.lora_dropout, + "lora_merge_weights": self.lora_merge_weights, } base_config = super().get_config() @@ -584,24 +728,78 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - valid_args = ( - "pretrained_model_path", - "normalize_input", - "use_input_attention_mask", - "cache_dir", - "force_download", - "resume_download", - "revision", - "drop_layers_gt", - "ignore_pretrained", - "override_dropouts", - "override_spec_augment", - "left_encoder_context", - "right_encoder_context", - "sample_frequency", + return filter_func_args(HFWav2VecBase.__init__, **kwargs) + # valid_args = ( + # "pretrained_model_path", + # "normalize_input", + # "use_input_attention_mask", + # "cache_dir", + # "force_download", + # "resume_download", + # "revision", + # "drop_layers_gt", + # "ignore_pretrained", + # "override_dropouts", + # "override_spec_augment", + # "left_encoder_context", + # "right_encoder_context", + # "sample_frequency", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args + + @staticmethod + def _add_lr_args(parser): + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) + + @staticmethod + def _add_lora_args(parser): + parser.add_argument( + "--use-lora", + default=False, + action=ActionYesNo, + help="use low-rank adapters", + ) + parser.add_argument( + "--lora-components", + default=["q_proj", "v_proj"], + nargs="+", + choices=[ + "k_proj", + "q_proj", + "v_proj", + "out_proj", + "intermediate_dense", + "output_dense", + ], + help="list of components where we apply LoRA, eg [Wq, Wv]", + ) + parser.add_argument("--lora-rank", default=4, help="rank of LoRA") + parser.add_argument("--lora-alpha", default=1.0, help="scale for LoRA") + parser.add_argument("--lora-dropout", default=0.0, help="dropout rate for LoRA") + parser.add_argument( + "--lora-merge-weights", + default=True, + action=ActionYesNo, + help="lora weights are merged with the pretrained weights at inference.", ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -703,36 +901,22 @@ def add_class_args(parser, prefix=None, skip=set()): "when the signal is evaluated chunk by chunk." ), ) - parser.add_argument( - "--feat-extractor-lr", - default=None, - type=float, - help=( - "lr for conv feature extractor, it serves to set a lr " - "different than the global one." - ), - ) - parser.add_argument( - "--encoder-lr", - default=None, - type=float, - help=( - "lr for transformer encoder, it serves to set a lr " - "different than the global one." - ), - ) + + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): - valid_args = ( - "override_dropouts", - "override_spec_augment", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args + return filter_func_args(HFWav2VecBase.change_config, **kwargs) + # valid_args = ( + # "override_dropouts", + # "override_spec_augment", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -759,23 +943,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): ), ) parser.add_argument( - "--feat-extractor-lr", - default=None, - type=float, - help=( - "lr for conv feature extractor, it serves to set a lr " - "different than the global one." - ), - ) - parser.add_argument( - "--encoder-lr", - default=None, - type=float, - help=( - "lr for transformer encoder, it serves to set a lr " - "different than the global one." 
- ), + "--override-lora", + default=False, + action=ActionYesNo, + help=("whether to change the config of LoRA layers in the model."), ) + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index dd446576..51f0f37a 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -55,7 +55,6 @@ def __init__( sparse_trials: bool = False, table_sep: Optional[str] = None, ): - if isinstance(segments, SegmentSet): self._segments = segments self._segments_path = None @@ -82,10 +81,12 @@ def __init__( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, EnrollmentMap, + enrollments, + EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, (TrialKey, TrialNdx, SparseTrialKey), + trials, + (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials @@ -711,7 +712,8 @@ def add_features(self, features_name: str, features: Union[PathLike, FeatureSet] raise ValueError() def set_segments( - self, segments: Union[PathLike, SegmentSet], update_seg_durs: bool, + self, + segments: Union[PathLike, SegmentSet], ): if isinstance(segments, (str, Path)): self._segments = None @@ -723,7 +725,9 @@ def set_segments( raise ValueError() def set_recordings( - self, recordings: Union[PathLike, RecordingSet], update_seg_durs: bool, + self, + recordings: Union[PathLike, RecordingSet], + update_seg_durs: bool = False, ): if isinstance(recordings, (str, Path)): self._recordings = None @@ -753,7 +757,9 @@ def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): raise ValueError() def add_enrollments( - self, enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], ): if self._enrollments is None: self._enrollments = {} @@ -793,7 +799,9 @@ def remove_features(self, features_name: str): del self._features[features_name] del self._features_paths[features_name] - def remove_recordings(self,): + def remove_recordings( + self, + ): if self._recordings_path is not None: self._files_to_delete.append(self._recordings_path) @@ -820,7 +828,8 @@ def remove_classes(self, classes_name: str): del self._classes_paths[classes_name] def remove_enrollments( - self, enrollments_name: str, + self, + enrollments_name: str, ): if self._enrollments_paths[enrollments_name] is not None: self._files_to_delete.append(self._enrollments_paths[enrollments_name]) @@ -829,7 +838,8 @@ def remove_enrollments( del self._enrollments_paths[enrollments_name] def remove_trials( - self, trials_name: str, + self, + trials_name: str, ): if self._trials_paths[trials_name] is not None: self._files_to_delete.append(self._trials_paths[trials_name]) @@ -981,14 +991,20 @@ def split_into_trials_and_cohort( segments_male = SegmentSet(segments[segments["gender"] == "m"]) segments_female = SegmentSet(segments[segments["gender"] == "f"]) trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( - segments_male, num_tar_trials, num_trial_speakers, seed, + segments_male, + num_tar_trials, + num_trial_speakers, + seed, ) ( trials_female, enroll_female, cohort_female, ) = self._split_into_trials_and_cohort( - segments_female, num_tar_trials, num_trial_speakers, seed, + segments_female, + num_tar_trials, + num_trial_speakers, + seed, ) trials = 
TrialKey.merge([trials_male, trials_female]) enroll = EnrollmentMap.cat([enroll_male, enroll_female]) @@ -996,7 +1012,10 @@ def split_into_trials_and_cohort( else: segments = self.segments() trials, enroll, cohort = self._split_into_trials_and_cohort( - segments, num_tar_trials, num_trial_speakers, seed, + segments, + num_tar_trials, + num_trial_speakers, + seed, ) dataset_trials = self.clone() @@ -1019,7 +1038,10 @@ def remove_short_segments(self, min_length: float, length_name: str = "duration" self.clean() def remove_classes_few_segments( - self, class_name: str, min_segs: int, rebuild_idx: bool = False, + self, + class_name: str, + min_segs: int, + rebuild_idx: bool = False, ): segments = self.segments() classes, counts = np.unique(segments[class_name], return_counts=True) @@ -1082,7 +1104,10 @@ def _segments_split_joint_classes( return train_segs, val_segs def _segments_split_disjoint_classes( - self, val_prob: float, disjoint_classes: List[str], rng: np.random.Generator, + self, + val_prob: float, + disjoint_classes: List[str], + rng: np.random.Generator, ): segments = self.segments() classes = segments[disjoint_classes].apply("-".join, axis=1) @@ -1165,15 +1190,24 @@ def split_train_val( train_segs, val_segs = self._segments_split(val_prob, rng) elif joint_classes is not None and disjoint_classes is None: train_segs, val_segs = self._segments_split_joint_classes( - val_prob, joint_classes, min_train_samples, rng, + val_prob, + joint_classes, + min_train_samples, + rng, ) elif joint_classes is None and disjoint_classes is not None: train_segs, val_segs = self._segments_split_disjoint_classes( - val_prob, disjoint_classes, rng, + val_prob, + disjoint_classes, + rng, ) else: train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( - val_prob, joint_classes, disjoint_classes, min_train_samples, rng, + val_prob, + joint_classes, + disjoint_classes, + min_train_samples, + rng, ) train_ds = self.clone() diff --git a/requirements.txt b/requirements.txt index c3410829..1e1aea9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,6 @@ memory_profiler gdown fairscale==0.4.4 tensorboard>=2.5.0 -yapf jsonargparse>=3.5.0 wandb>=0.10.30 librosa>=0.8.1 @@ -22,3 +21,6 @@ twine wheel transformers>=4.16.2 sentencepiece>=0.1.97 +loralib +lhotse + From 71f629d94aa981ea39a87b1a9a0afe8ab257b2a5 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Tue, 12 Sep 2023 22:24:37 -0400 Subject: [PATCH 81/89] add lora into ASR (haven't tested) --- egs/commonvoice/v1/cmd.sh | 2 +- ...v2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml | 88 +++++++++++++++++++ egs/commonvoice/v1/datapath.sh | 2 +- .../config_pruned_transducer_v6.0_13langs.sh | 44 ++++++++++ .../wav2transducer/hf_wav2rnn_transducer.py | 18 ++++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh index cedd70f9..697d5219 100755 --- a/egs/commonvoice/v1/cmd.sh +++ b/egs/commonvoice/v1/cmd.sh @@ -18,7 +18,7 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" -elif [ "$(hostname -d)" == "rockfish.cluster" ];then +elif [ "$(hostname -d)" == "cm.cluster" ];then export 
train_cmd="slurm.pl --config conf/slurm.conf --mem 4G" export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" export cuda_eval_cmd="$train_cmd" diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml new file mode 100644 index 00000000..54ccd48e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml @@ -0,0 +1,88 @@ +# for LoRA ASR +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + use_lora: true + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-lora + + \ No newline at end of file diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh index 56b242ed..a1430c8b 100644 --- a/egs/commonvoice/v1/datapath.sh +++ b/egs/commonvoice/v1/datapath.sh @@ -9,7 +9,7 @@ if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then musan_root=/export/corpora5/JHU/musan echo "Put your database paths here" exit 1 -elif [ "$(hostname --domain)" == "rockfish.cluster" ];then +elif [ "$(hostname --domain)" == "cm.cluster" ];then commonvoice_root=/data/jvillal7/corpora/commonvoice musan_root=/data/jvillal7/corpora/musan elif [ "$(hostname --domain)" == "cm.gemini" ];then diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh new file mode 100644 index 00000000..cce21f4c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio 
+test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v6.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py index 1d16675c..8fc59a3d 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -226,6 +226,9 @@ def freeze_feat_fuser(self): def freeze_hf_feats(self): self.hf_feats.freeze() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() @@ -247,6 +250,15 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") @@ -270,6 +282,9 @@ def _train(self, train_mode: str): "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.transducer._train("full") @@ -287,6 +302,9 @@ def valid_train_modes(): "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod From a75610ee27acf2cd15ecc38151f5efff6fa09623 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 13 Sep 2023 10:59:46 -0400 Subject: [PATCH 82/89] vox2.1 working and lora --- egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml | 13 ++-- ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml | 13 ++-- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- 
...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 13 ++-- ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...rge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml | 71 ++++++++++++++++++ ...rge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml | 74 +++++++++++++++++++ ...vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh | 55 ++++++++++++++ hyperion/torch/layers/lora.py | 52 +++++++++++-- hyperion/torch/models/xvectors/xvector.py | 36 +-------- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 22 +++++- 22 files changed, 378 insertions(+), 140 deletions(-) create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml index 4fdf8068..86f55073 100644 --- a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -4,32 +4,31 @@ reverb_aug: rir_types: smallroom: weight: 1 - rir_path: scp:data/rirs_smallroom/rirs.scp + rir_path: csv:data/rirs_smallroom/rirs.csv rir_norm: max mediumroom: weight: 1 - rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_path: csv:data/rirs_mediumroom/rirs.csv rir_norm: max realroom: weight: 1 - rir_path: scp:data/rirs_real/rirs.scp + rir_path: csv:data/rirs_real/rirs.csv rir_norm: max noise_aug: noise_prob: 0.7 noise_types: noise: weight: 1 - noise_path: data/musan_noise_proc_audio/wav.scp + noise_path: data/musan_noise_proc_audio/recordings.csv min_snr: 0 max_snr: 18 music: weight: 1 - noise_path: data/musan_music_proc_audio/wav.scp + noise_path: data/musan_music_proc_audio/recordings.csv min_snr: 3 max_snr: 18 babble: weight: 1 - noise_path: data/musan_speech_babble/wav.scp + noise_path: data/musan_speech_babble/recordings.csv min_snr: 3 max_snr: 18 - diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml index ad991124..ffd2f374 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml index 0b1d0454..7dcc56ef 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml index 254ff796..3f5c46bc 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml index 52be6db5..9e1d0928 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: 
min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml index bd3e7f86..0d0dc398 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..dda0c632 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker 
weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml index abe5da6e..46ee7d18 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml index 7287188c..db36f8ee 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 64 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 64 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..dda0c632 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - 
conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml index 5e1260ad..ad56e80d 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml index 
2addaa1e..40341a27 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml index 5e1260ad..ad56e80d 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 
eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..b5b9b6b6 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + override_lora: true + use_lora: true + lora_rank: 4 + lora_components: + - q_proj + - v_proj + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..a39445ff --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh 
b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..96ef76c5 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,55 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_name=${hf_model_name}_loraqv_ecapatdnn512x3_v2.0 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py index 1436caf5..18401669 100644 --- a/hyperion/torch/layers/lora.py +++ b/hyperion/torch/layers/lora.py @@ -7,7 +7,47 @@ import loralib as lora import torch.nn as nn -from loralib import * +from loralib import mark_only_lora_as_trainable + + +def repr_lora(self, str_base): + if isinstance(self.lora_dropout, nn.Dropout): + lora_dropout = self.lora_dropout.p + else: + lora_dropout = 0 + + str_lora = f", r={self.r}, alpha={self.lora_alpha}, dropout={lora_dropout}, merge_weights={self.merge_weights})" + return str_base[:-1] + str_lora + + +class LinearLoRA(lora.Linear): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class EmbeddingLoRA(lora.Embedding): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv1dLoRA(lora.Conv1d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv2dLoRA(lora.Conv2d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv3dLoRA(lora.Conv3d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) class LoRAFactory: @@ -19,7 +59,7 @@ def create_from_pretrained( merge_weights: bool = True, ): if isinstance(layer, nn.Embedding): - lora_layer = lora.Embedding( + lora_layer = EmbeddingLoRA( layer.num_embeddings, layer.embedding_dim, padding_idx=layer.padding_idx, @@ -36,7 +76,7 @@ def create_from_pretrained( elif isinstance(layer, nn.Linear): bias = layer.bias is not None - lora_layer = lora.Linear( + lora_layer = LinearLoRA( layer.in_features, layer.out_features, bias=bias, @@ -51,11 +91,11 @@ def create_from_pretrained( elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): if isinstance(layer, nn.Conv1d): - lora_class = lora.Conv1d + lora_class = Conv1dLoRA elif isinstance(layer, nn.Conv2d): - lora_class = lora.Conv2d + lora_class = 
Conv2dLoRA elif isinstance(layer, nn.Conv3d): - lora_class = lora.Conv3d + lora_class = Conv3dLoRA bias = layer.bias is not None lora_layer = lora_class( diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index d67785d2..9ccd0d31 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -6,10 +6,9 @@ from enum import Enum from typing import Optional -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock @@ -52,7 +51,6 @@ def __init__( in_feats=None, proj_feats=None, ): - super().__init__() # encoder network @@ -407,7 +405,6 @@ def extract_embed_slidwin( embed_layer=None, detach_chunks=False, ): - if feat_frame_shift is not None: # assume win_length/shift are in secs, transform to frames # pass feat times from msecs to secs @@ -464,7 +461,6 @@ def compute_slidwin_timestamps( feat_frame_shift=10, feat_snip_edges=False, ): - P = self.compute_slidwin_left_padding( win_length, win_shift, @@ -495,7 +491,6 @@ def compute_slidwin_left_padding( feat_frame_shift=10, feat_snip_edges=False, ): - # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 feat_frame_length = feat_frame_length / 1000 @@ -526,7 +521,6 @@ def compute_slidwin_left_padding( return P1 + P2 def get_config(self): - enc_cfg = self.encoder_net.get_config() pool_cfg = PF.get_config(self.pool_net) @@ -694,42 +688,14 @@ def valid_train_modes(): @staticmethod def filter_args(**kwargs): - # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) args = filter_func_args(ClassifHead.__init__, kwargs) args["pool_net"] = pool_args return args - # valid_args = ( - # "num_classes", - # "embed_dim", - # "num_embed_layers", - # "hid_act", - # "loss_type", - # "cos_scale", - # "margin", - # "margin_warmup_epochs", - # "intertop_k", - # "intertop_margin", - # "num_subcenters", - # "use_norm", - # "norm_before", - # "in_feats", - # "proj_feats", - # "dropout_rate", - # "norm_layer", - # "head_norm_layer", - # "head_use_in_norm", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - # args["pool_net"] = pool_args - # return args - @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 2c8d239f..a981d1ec 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -545,6 +545,24 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) + if ddp_get_rank() == 0: + lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # print( + # "lora\nw=", + # lora_layer.weight[:3, :3], + # "\na=", + # lora_layer.lora_A[:3, :3], + # "\nb=", + # lora_layer.lora_B[:3, :3], + # "\n", + # "merged=", + # lora_layer.merged, + # "training=", + # lora_layer.training, + # flush=True, + # ) + assert self.training == lora_layer.training + assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, @@ -728,7 +746,7 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - return filter_func_args(HFWav2VecBase.__init__, **kwargs) + return filter_func_args(HFWav2VecBase.__init__, kwargs) # valid_args = 
( # "pretrained_model_path", # "normalize_input", @@ -910,7 +928,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - return filter_func_args(HFWav2VecBase.change_config, **kwargs) + return filter_func_args(HFWav2VecBase.change_config, kwargs) # valid_args = ( # "override_dropouts", # "override_spec_augment", From c23103ee406a833726516ff8ac35b3a06382e97e Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 13 Sep 2023 19:32:11 -0400 Subject: [PATCH 83/89] lora in wavlm and hubert --- hyperion/torch/tpm/hf/hf_hubert.py | 25 ++++++++++++++++++++----- hyperion/torch/tpm/hf/hf_wav2vec2.py | 8 +++++++- hyperion/torch/tpm/hf/hf_wavlm.py | 25 ++++++++++++++++++++----- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 2957e433..32355bf6 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import HubertConfig, HubertModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -137,6 +136,12 @@ class HFHubert(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -186,8 +191,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -205,6 +214,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -618,7 +633,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index dd5de2fe..bc98f460 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -149,6 +149,12 @@ class HFWav2Vec2(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. 
feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -697,7 +703,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index e1b67d81..400e6a8b 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import WavLMConfig, WavLMModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import WavLMConfig, WavLMModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -150,6 +149,12 @@ class HFWavLM(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. 
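The lora_* options documented above configure the standard low-rank adaptation update. A minimal sketch of what they control is given below; it assumes a plain nn.Linear being adapted and uses illustrative names (LoRALinearSketch), not the hyperion or loralib classes:

import math

import torch
import torch.nn as nn


class LoRALinearSketch(nn.Module):
    """Frozen base weight W plus a trainable low-rank update (lora_alpha / r) * B @ A."""

    def __init__(self, in_features, out_features, r=4, lora_alpha=1, lora_dropout=0.0):
        super().__init__()
        # the pretrained weight stays frozen; only lora_A and lora_B are trained
        self.weight = nn.Parameter(torch.randn(out_features, in_features), requires_grad=False)
        self.lora_A = nn.Parameter(torch.zeros(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))  # B starts at zero, so the update starts at zero
        self.scaling = lora_alpha / r
        self.dropout = nn.Dropout(lora_dropout)

    def forward(self, x):
        base = x @ self.weight.t()
        update = self.dropout(x) @ self.lora_A.t() @ self.lora_B.t() * self.scaling
        return base + update

With lora_merge_weights, the folded matrix W + scaling * B @ A would replace the two-term sum at inference; with the configs above (lora_rank: 4 on q_proj and v_proj, train_mode: hf-lora), only these low-rank factors receive gradients.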
""" def __init__( @@ -204,8 +209,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -223,6 +232,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -687,7 +702,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", From 81c540b1492ec7b42299f0ebb871f6af66d11304 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 15 Sep 2023 12:35:56 -0400 Subject: [PATCH 84/89] fix bug in w2v constructors with lora --- ...v2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh | 6 +++--- egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 2 +- hyperion/torch/tpm/hf/hf_hubert.py | 10 ++++++++++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 9 +++++++++ hyperion/torch/tpm/hf/hf_wav2vec_base.py | 17 ++++------------- hyperion/torch/tpm/hf/hf_wavlm.py | 10 ++++++++++ 6 files changed, 37 insertions(+), 17 deletions(-) diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh index 96ef76c5..1985b8e6 100644 --- a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -37,9 +37,9 @@ nnet_s3=$nnet_s3_dir/model_ep0004.pth # back-end do_plda=false -do_snorm=true -do_qmf=true -do_voxsrc22=true +#do_snorm=true +#do_qmf=true +#do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh index 2cfe27fe..72b019cd 100755 --- a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -17,7 +17,7 @@ xvec_chunk_length=120.0 . 
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_args="--use-gpu --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" num_gpus=1 else diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 32355bf6..638bf561 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -191,6 +191,7 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, @@ -298,6 +299,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index bc98f460..5b59d79a 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -322,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index a981d1ec..e0bcee1c 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -174,15 +174,6 @@ def __init__( self._frame_shift = None self.hf_model = None - if use_lora: - self._make_lora_layers( - lora_components, - lora_rank, - lora_alpha, - lora_dropout, - lora_merge_weights, - ) - def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. The weight_norm in the Conv. Pos. Encoder of Wav2Vec models make the default deepcopy to fail. 
@@ -545,8 +536,8 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) - if ddp_get_rank() == 0: - lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # if ddp_get_rank() == 0: + # lora_layer = self.hf_model.encoder.layers[0].attention.v_proj # print( # "lora\nw=", # lora_layer.weight[:3, :3], @@ -561,8 +552,8 @@ def forward_impl( # lora_layer.training, # flush=True, # ) - assert self.training == lora_layer.training - assert self.training == (not lora_layer.merged) + # assert self.training == lora_layer.training + # assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 400e6a8b..1db5fa23 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -209,6 +209,7 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, @@ -321,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property From a54c963d8d40dbcb49604ef3febb909768e02790 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 23 Sep 2023 17:01:27 -0400 Subject: [PATCH 85/89] update default argument of lora_merge_weights to false --- hyperion/torch/layers/lora.py | 2 +- hyperion/torch/tpm/hf/hf_wav2vec2.py | 5 ++++- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 10 +++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py index 18401669..91279119 100644 --- a/hyperion/torch/layers/lora.py +++ b/hyperion/torch/layers/lora.py @@ -56,7 +56,7 @@ def create_from_pretrained( r: int = 8, lora_alpha: int = 1, lora_dropout: float = 0.0, - merge_weights: bool = True, + merge_weights: bool = False, ): if isinstance(layer, nn.Embedding): lora_layer = EmbeddingLoRA( diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 5b59d79a..901c5072 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -204,6 +204,7 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + override_lora: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -214,7 +215,7 @@ def __init__( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, ): super().__init__( pretrained_model_path=pretrained_model_path, @@ -228,6 +229,7 @@ def __init__( ignore_pretrained=ignore_pretrained, override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, + override_lora=override_lora, left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, @@ -269,6 +271,7 @@ def __init__( self.change_config( override_dropouts=self.override_dropouts, override_spec_augment=self.override_spec_augment, + override_lora=self.override_lora, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, diff --git 
a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index e0bcee1c..21dbcd54 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -15,6 +15,7 @@ from ....utils.misc import filter_func_args from ...layers import LoRAFactory +import loralib as lora from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs @@ -77,6 +78,7 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + override_lora: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -87,7 +89,7 @@ def __init__( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -99,6 +101,7 @@ def __init__( self.ignore_pretrained = ignore_pretrained self.override_dropouts = override_dropouts self.override_spec_augment = override_spec_augment + self.override_lora = override_lora self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr @@ -253,7 +256,7 @@ def change_config( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, **kwargs, ): if override_spec_augment: @@ -304,7 +307,7 @@ def change_lora( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, ): if not self.use_lora: if use_lora: @@ -714,6 +717,7 @@ def get_config(self): "ignore_pretrained": self.ignore_pretrained, "override_dropouts": self.override_dropouts, "override_spec_augment": self.override_spec_augment, + "override_lora": self.override_lora, "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": self.sample_frequency, From 6a72173026af1a7d57cb1cc0dfb99cd62ba2975c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 25 Sep 2023 00:42:36 +0000 Subject: [PATCH 86/89] update config for 4 langs experiment --- ...ase_rnnt_k2_pruned_4langs_stage1_v4.0.yaml | 87 +++++++++++++++++++ .../config_pruned_transducer_v4.0_4langs.sh | 46 ++++++++++ 2 files changed, 133 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml new file mode 100644 index 00000000..465cfcdb --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml @@ -0,0 +1,87 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 0.6 + + data_loader: + 
num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + reduction: mean + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh new file mode 100644 index 00000000..424c2649 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh @@ -0,0 +1,46 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=4_langs_train_proc_audio +dev_data=4_langs_dev_proc_audio + +test_data="tr_test_proc_audio fr_test_proc_audio de_test_proc_audio it_test_proc_audio" + + +lans="tr de fr it" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_4_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From e15b227c66e80cb69e600ddde9a0b56ef32bd389 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 30 Sep 2023 17:01:48 +0000 Subject: [PATCH 87/89] Add FiLM inside the Wav2vec2 --- ...2base_rnnt_film_k2_pruned_stage3_v7.0.yaml | 98 + ...g_pruned_filmed_transducer_v7.0_13langs.sh | 44 + hyperion/torch/tpm/hf/hf_wav2vec2.py | 19 +- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 164 +- .../tpm/hf/wav2vec2/modeling_wav2vec2.py | 2477 +++++++++++++++++ 5 files changed, 2788 
insertions(+), 14 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh create mode 100644 hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml new file mode 100644 index 00000000..9ab275a6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + # override_condition: true + use_condition: true + condition_size: 128 + condition_components: + - attention + condition_type: "one-hot" + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 30000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh new file mode 100644 index 00000000..b101854c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="en_test_proc_audio ca_test_proc_audio" +#ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio 
kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v7.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v7.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0005.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v7.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0011.pth diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 901c5072..d2638acd 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -13,7 +13,7 @@ from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase - +from .wav2vec2.modeling_wav2vec2 import Wav2Vec2CondModel class HFWav2Vec2(HFWav2VecBase): r"""This is wrapper over HuggingFace Wav2Vec2 model. @@ -205,6 +205,7 @@ def __init__( override_dropouts: bool = False, override_spec_augment: bool = False, override_lora: bool = False, + override_condition: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -216,6 +217,10 @@ def __init__( lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = False, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", ): super().__init__( pretrained_model_path=pretrained_model_path, @@ -230,6 +235,7 @@ def __init__( override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, override_lora=override_lora, + override_condition=override_condition, left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, @@ -241,6 +247,10 @@ def __init__( lora_alpha=lora_alpha, lora_dropout=lora_dropout, lora_merge_weights=lora_merge_weights, + use_condition=use_condition, + condition_size=condition_size, + condition_components=condition_components, + condition_type=condition_type, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -272,6 +282,7 @@ def __init__( override_dropouts=self.override_dropouts, override_spec_augment=self.override_spec_augment, override_lora=self.override_lora, + override_condition=self.override_condition, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, @@ -333,6 +344,12 @@ def __init__( lora_dropout, lora_merge_weights, ) + if use_condition: + self._make_condition_layers( + condition_size, + condition_components, + condition_type, + ) self.ignore_pretrained = True diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 21dbcd54..9f799ded 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ 
b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -19,6 +19,7 @@ from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs +from .wav2vec2.modeling_wav2vec2 import Wav2Vec2CondModel class HFWav2VecBase(TorchModel): @@ -79,6 +80,7 @@ def __init__( override_dropouts: bool = False, override_spec_augment: bool = False, override_lora: bool = False, + override_condition: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -90,6 +92,10 @@ def __init__( lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = False, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -102,6 +108,7 @@ def __init__( self.override_dropouts = override_dropouts self.override_spec_augment = override_spec_augment self.override_lora = override_lora + self.override_condition = override_condition self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr @@ -112,6 +119,10 @@ def __init__( self.lora_alpha = lora_alpha self.lora_dropout = lora_dropout self.lora_merge_weights = lora_merge_weights + self.use_condition = use_condition + self.condition_size = condition_size + self.condition_components = condition_components + self.condition_type = condition_type if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -249,14 +260,19 @@ def change_config( override_dropouts: bool, override_spec_augment: bool, override_lora: bool, + override_condition: bool, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, use_lora: bool = False, + use_condition: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", **kwargs, ): if override_spec_augment: @@ -278,6 +294,15 @@ def change_config( lora_merge_weights=lora_merge_weights, ) + if override_condition: + logging.info(f"overriding Condition config") + self.change_condition( + use_condition=use_condition, + condition_size=condition_size, + condition_components=condition_components, + condition_type=condition_type, + ) + self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr @@ -391,6 +416,53 @@ def _recursive_replace_layer_by_lora( setattr(model, name, lora_layer) counts[name] += 1 + def change_condition(self, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", + ): + if not self.use_condition: + if use_condition: + self._make_condition_layers( + condition_size, + condition_components, + condition_type, + ) + else: + pass + else: + if use_condition: + pass + else: + pass + self.use_condition = use_condition + self.condition_size = condition_size + self.condition_components = condition_components + self.condition_type = condition_type + + def _make_condition_layers(self, + condition_size: int, + condition_components: List[str], + condition_type: str, + ): + # TODO: copy weight from self.hf_model to self.hf_model_with_condition + config = self.hf_model.config + config.condition_size = 
condition_size + config.condition_components = condition_components + config.condition_type = condition_type + + hf_model_with_condition = Wav2Vec2CondModel(config) + self._copy_condition_weights(self.hf_model, hf_model_with_condition) + # TODO: make weight for the FiLM layers (0,1) + self.hf_model = hf_model_with_condition + + + def _copy_condition_weights(self, hf_model, hf_model_with_condition): + for name, param in hf_model.named_parameters(): + if name in hf_model_with_condition.state_dict(): + hf_model_with_condition.state_dict()[name].data.copy_(param.data) + def change_dropouts(self, **kwargs): pass # needs to be overloaded @@ -466,6 +538,7 @@ def forward( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, chunk_length: float = 0, @@ -496,11 +569,12 @@ def forward( (tuple(torch.FloatTensor)). """ if chunk_length == 0 or x.size(1) < chunk_length * self.sample_frequency: - return self.forward_impl(x, x_lengths, return_attentions, return_hid_states) + return self.forward_impl(x, x_lengths, condition_features, return_attentions, return_hid_states) else: return self.forward_long_impl( x, x_lengths, + condition_features, return_attentions, return_hid_states, chunk_length, @@ -511,6 +585,7 @@ def forward_impl( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, ): @@ -557,12 +632,23 @@ def forward_impl( # ) # assert self.training == lora_layer.training # assert self.training == (not lora_layer.merged) - output = self.hf_model( - x, - x_mask, - output_attentions=return_attentions, - output_hidden_states=return_hid_states, - ) + + if condition_features is not None: + output = self.hf_model( + x, + condition_features, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + + else: + output = self.hf_model( + x, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) max_out_length = output.last_hidden_state.size(1) feat_lengths = ( None @@ -577,6 +663,7 @@ def forward_long_impl( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, chunk_length: float = 120.0, @@ -633,12 +720,21 @@ def forward_long_impl( stop_i = min(start + chunk_length + right_context, x.size(1)) x_i = x[:, start_i:stop_i] x_mask_i = None if x_mask is None else x_mask[start_i:stop_i] - output_i = self.hf_model( - x_i, - x_mask_i, - output_attentions=return_attentions, - output_hidden_states=return_hid_states, - ) + if condition_features is not None: + output_i = self.hf_model( + x_i, + x_mask_i, + condition_features=condition_features, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + else: + output_i = self.hf_model( + x_i, + x_mask_i, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) if i < num_chunks - 1: start_out_i = max( @@ -718,6 +814,7 @@ def get_config(self): "override_dropouts": self.override_dropouts, "override_spec_augment": self.override_spec_augment, "override_lora": self.override_lora, + "override_condition": self.override_condition, "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": 
self.sample_frequency, @@ -729,6 +826,10 @@ def get_config(self): "lora_alpha": self.lora_alpha, "lora_dropout": self.lora_dropout, "lora_merge_weights": self.lora_merge_weights, + "use_condition": self.use_condition, + "condition_size": self.condition_size, + "condition_components": self.condition_components, + "condition_type": self.condition_type, } base_config = super().get_config() @@ -814,6 +915,34 @@ def _add_lora_args(parser): help="lora weights are merged with the pretrained weights at inference.", ) + def _add_condition_args(parser): + parser.add_argument( + "--use-condition", + default=False, + action=ActionYesNo, + help="use condition", + ) + parser.add_argument( + "--condition-size", + default=128, + type=int, + help="size of the condition", + ) + parser.add_argument( + "--condition-components", + default=["attention"], + nargs="+", + choices=["attention"], + help="list of components where we apply condition, eg [attention]", + ) + parser.add_argument( + "--condition-type", + default="one-hot", + choices=["one-hot", "learned"], + help="type of condition", + ) + + @staticmethod def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: @@ -917,6 +1046,7 @@ def add_class_args(parser, prefix=None, skip=set()): HFWav2VecBase._add_lr_args(parser) HFWav2VecBase._add_lora_args(parser) + HFWav2VecBase._add_condition_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @@ -962,7 +1092,15 @@ def add_finetune_args(parser, prefix=None, skip=set()): help=("whether to change the config of LoRA layers in the model."), ) + parser.add_argument( + "--override-condition", + default=False, + action=ActionYesNo, + help=("whether to change the config of condition layers in the model."), + ) + HFWav2VecBase._add_lr_args(parser) HFWav2VecBase._add_lora_args(parser) + HFWav2VecBase._add_condition_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py b/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py new file mode 100644 index 00000000..ceeda9a9 --- /dev/null +++ b/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,2477 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
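The use_condition / condition_size / condition_components options introduced in this commit gate FiLM-style conditioning of the encoder. A minimal sketch of the feature-wise linear modulation being wired in, with illustrative names rather than the exact interface of the hyperion FiLM layer block:

import torch
import torch.nn as nn


class FiLMSketch(nn.Module):
    """Feature-wise linear modulation: h -> gamma(c) * h + beta(c)."""

    def __init__(self, hidden_size, condition_size):
        super().__init__()
        self.to_gamma = nn.Linear(condition_size, hidden_size)
        self.to_beta = nn.Linear(condition_size, hidden_size)

    def forward(self, hidden, condition):
        # hidden: (batch, time, hidden_size); condition: (batch, condition_size),
        # e.g. a one-hot or learned language vector projected to condition_size=128
        gamma = self.to_gamma(condition).unsqueeze(1)
        beta = self.to_beta(condition).unsqueeze(1)
        return gamma * hidden + beta

With condition_components: [attention], as in the v7.0 config, a block of this form would modulate the attention output of each conditioned encoder layer.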
+""" PyTorch Wav2Vec2 model.""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +# from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.models.wav2vec2.modeling_wav2vec2 import is_deepspeed_zero3_enabled +from transformers.modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + cached_file, + is_safetensors_available, + logging, + replace_return_docstrings, +) + +from transformers import Wav2Vec2Config + +from ....layer_blocks import FiLM + +WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" +WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" + +if is_safetensors_available(): + from safetensors.torch import load_file as safe_load_file + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = "anton-l/wav2vec2-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 + + +WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/wav2vec2-base-960h", + "facebook/wav2vec2-large-960h", + "facebook/wav2vec2-large-960h-lv60", + "facebook/wav2vec2-large-960h-lv60-self", + # See all Wav2Vec2 models at https://huggingface.co/models?filter=wav2vec2 +] + + +@dataclass +class Wav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: torch.FloatTensor = None + projected_quantized_states: torch.FloatTensor = None + codevector_perplexity: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +def _sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None +): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = np.arange(sequence_length) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) + + mask_time_indices = ( + mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) + sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +class Wav2Vec2NoLayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return 
hidden_states + + +class Wav2Vec2LayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2GroupNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2PositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + deepspeed.zero.register_external_parameter(self, self.conv.weight_v) + deepspeed.zero.register_external_parameter(self, self.conv.weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2SamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2FeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] 
+ [ + Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(conv_layer), + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class Wav2Vec2FeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 +class Wav2Vec2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. 
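+        # Shape sketch with assumed sizes (bsz=2, num_heads=12, tgt_len=50, head_dim=64,
+        # so embed_dim=768): attn_output has gone (2*12, 50, 64) -> (2, 12, 50, 64)
+        # -> (2, 50, 12, 64) above, and the reshape below yields (2, 50, 768).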
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Wav2Vec2FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2CondEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.condition_type = config.condition_type + self.condition_layer = FiLM(config.hidden_size, config.condition_size, "linear") + + + def forward(self, hidden_states, condition_features, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = self.condition_layer(hidden_states, condition_features) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2CondEncoderLayerStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if getattr(config, "adapter_attn_dim", None) is not None: + self.adapter_layer = Wav2Vec2AttnAdapterLayer(config) + else: + self.adapter_layer = None + + self.condition_type = config.condition_type + self.condition_layer = FiLM(config.hidden_size, config.condition_size, "linear") + + def forward( + self, + hidden_states: torch.Tensor, + condition_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + 
hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = self.condition_layer(hidden_states, condition_features) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + if self.adapter_layer is not None: + hidden_states = hidden_states + self.adapter_layer(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2CondEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2CondEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.tensor, + condition_features: torch.tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, condition_features, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not 
None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2CondEncoderStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList( + [Wav2Vec2CondEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + condition_features, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens are not attended to + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, condition_features, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2GumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. 
+ """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = nn.Parameter( + torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + probs = torch.where(mask_extended, probs, torch.zeros_like(probs)) + marginal_probs = probs.sum(dim=0) / mask.sum() + else: + marginal_probs = probs.mean(dim=0) + + perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states, mask_time_indices=None): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = torch.softmax( + hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_( + -1, codevector_idx.view(-1, 1), 1.0 + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) + + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.view(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +class Wav2Vec2Adapter(nn.Module): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = 
hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2AdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +class Wav2Vec2AttnAdapterLayer(nn.Module): + def __init__(self, config): + """ + Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed + up training throughput. + """ + super().__init__() + self.input_dim = config.adapter_attn_dim + self.hidden_dim = config.hidden_size + + self.norm = nn.LayerNorm(self.hidden_dim) + self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim) + self.act_fn = nn.ReLU() + self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) + + def forward(self, hidden_states: torch.FloatTensor): + hidden_states = self.norm(hidden_states) + + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +class Wav2Vec2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Wav2Vec2Config + base_model_prefix = "wav2vec2" + main_input_name = "input_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
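+        # Descriptive note: reset_parameters() applies the default nn.Linear init here,
+        # and _is_hf_initialized marks both projections so the model-wide weight-init
+        # pass does not overwrite them with the generic normal_(std=initializer_range)
+        # branch further down.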
+ if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + # gumbel softmax requires special init + elif isinstance(module, Wav2Vec2GumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2PositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2FeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
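+        # Illustrative example (assumed sizes): for an utterance whose features reduce to
+        # output_length 3 with feature_vector_length 5, a 1 is written at index 2 and the
+        # flip/cumsum/flip below turns [0, 0, 1, 0, 0] into the mask [1, 1, 1, 0, 0].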
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Wav2Vec2CondEncoder, Wav2Vec2CondEncoderStableLayerNorm, Wav2Vec2FeatureEncoder)): + module.gradient_checkpointing = value + + def _get_adapters(self): + if self.config.adapter_attn_dim is None: + raise ValueError(f"{self.__class__} has no adapter layers. Make sure to define `config.adapter_attn_dim`.") + + adapter_weights = {} + for name, module in self.named_modules(): + if isinstance(module, Wav2Vec2AttnAdapterLayer): + for param_name, param in module.named_parameters(): + adapter_weights[".".join([name, param_name])] = param + + if isinstance(self, Wav2Vec2ForCTC): + for name, param in self.lm_head.named_parameters(): + adapter_weights[".".join(["lm_head", name])] = param + + return adapter_weights + + def init_adapter_layers(self): + """ + (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning + """ + # init attention adapters + for module in self.modules(): + if isinstance(module, Wav2Vec2AttnAdapterLayer): + self._init_weights(module) + + # init lm head + if isinstance(self, Wav2Vec2ForCTC): + self._init_weights(self.lm_head) + + def load_adapter(self, target_lang: str, force_load=True, **kwargs): + r""" + Load a language adapter model from a pre-trained adapter model. + + Parameters: + target_lang (`str`): + Has to be a language id of an existing adapter weight. Adapter weights are stored in the format + adapter..safetensors or adapter..bin + force_load (`bool`, defaults to `True`): + Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
+ revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + mirror (`str`, *optional*): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + + + + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to + use this method in a firewalled environment. + + + + Examples: + + ```python + >>> from transformers import Wav2Vec2ForCTC, AutoProcessor + + >>> ckpt = "facebook/mms-1b-all" + >>> processor = AutoProcessor.from_pretrained(ckpt) + >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") + >>> # set specific language + >>> processor.tokenizer.set_target_lang("spa") + >>> model.load_adapter("spa") + ``` + """ + if self.config.adapter_attn_dim is None: + raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.") + + if target_lang == self.target_lang and not force_load: + logger.warning(f"Adapter weights are already set to {target_lang}.") + return + + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + token = kwargs.pop("token", None) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + model_path_or_id = self.config._name_or_path + state_dict = None + + # 1. Let's first try loading a safetensors adapter weight + if use_safetensors is not False: + filepath = WAV2VEC2_ADAPTER_SAFE_FILE.format(target_lang) + + try: + weight_path = cached_file( + model_path_or_id, + filename=filepath, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + cache_dir=cache_dir, + ) + + state_dict = safe_load_file(weight_path) + + except EnvironmentError: + if use_safetensors: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + + except Exception: + # For any other exception, we throw a generic error. + if use_safetensors: + raise EnvironmentError( + f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" + f" directory containing a file named {filepath}." + ) + + # 2. 
If this didn't work let's try loading a PyTorch adapter weight + if state_dict is None: + filepath = WAV2VEC2_ADAPTER_PT_FILE.format(target_lang) + + try: + weight_path = cached_file( + model_path_or_id, + filename=filepath, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + cache_dir=cache_dir, + ) + + state_dict = torch.load(weight_path, map_location="cpu") + + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" + f" directory containing a file named {filepath}." + ) + + adapter_weights = self._get_adapters() + unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys()) + missing_keys = set(adapter_weights.keys()) - set(state_dict.keys()) + + if len(unexpected_keys) > 0: + raise ValueError(f"The adapter weights {weight_path} has unexpected keys: {', '.join(unexpected_keys)}.") + elif len(missing_keys) > 0: + raise ValueError(f"The adapter weights {weight_path} has missing keys: {', '.join(missing_keys)}.") + + # make sure now vocab size is correct + target_vocab_size = state_dict["lm_head.weight"].shape[0] + if target_vocab_size != self.config.vocab_size: + self.lm_head = nn.Linear( + self.config.output_hidden_size, target_vocab_size, device=self.device, dtype=self.dtype + ) + self.config.vocab_size = target_vocab_size + + # make sure that adapter weights are put in exactly the same precision and device placement and overwritten adapter weights + state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()} + self.load_state_dict(state_dict, strict=False) + + # set target language corectly + self.target_lang = target_lang + + +WAV_2_VEC_2_START_DOCSTRING = r""" + Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech + Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael + Auli. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving etc.). + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +WAV_2_VEC_2_INPUTS_DOCSTRING = r""" + Args: + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install + soundfile`). 
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and + conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + + + `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == + True`. For all models whose processor has `config.return_attention_mask == False`, such as + [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be + passed to avoid degraded performance when doing batched inference. For such models `input_values` should + simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly + different results depending on whether `input_values` is padded or not. + + + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.", + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2CondModel(Wav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config): + super().__init__(config) + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + if config.do_stable_layer_norm: + self.encoder = Wav2Vec2CondEncoderStableLayerNorm(config) + else: + self.encoder = Wav2Vec2CondEncoder(config) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Wav2Vec2BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + condition_features: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + 
condition_features=condition_features, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""Wav2Vec2 Model with a quantizer and `VQ` head on top.""", WAV_2_VEC_2_START_DOCSTRING) +class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config): + super().__init__(config) + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) + + self.quantizer = Wav2Vec2GumbelVectorQuantizer(config) + + self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) + self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) + + # Initialize weights and apply final processing + self.post_init() + + def set_gumbel_temperature(self, temperature: int): + """ + Set the Gumbel softmax temperature to a given value. Only necessary for training + """ + self.quantizer.temperature = temperature + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + @staticmethod + def compute_contrastive_logits( + target_features: torch.FloatTensor, + negative_features: torch.FloatTensor, + predicted_features: torch.FloatTensor, + temperature: int = 0.1, + ): + """ + Compute logits for contrastive loss based using cosine similarity as the distance measure between + `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. 
+ """ + target_features = torch.cat([target_features, negative_features], dim=0) + + logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( + target_features + ) + + # apply temperature + logits = logits / temperature + return logits + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Wav2Vec2ForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.BoolTensor] = None, + sampled_negative_indices: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining + >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") + >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long) + >>> sampled_negative_indices = torch.tensor( + ... data=sampled_negative_indices, device=input_values.device, dtype=torch.long + ... ) + + >>> with torch.no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... 
).loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if mask_time_indices is not None: + mask_time_indices = mask_time_indices.to(torch.bool) + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + return_dict=return_dict, + ) + + # 1. project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # 2. quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + + if attention_mask is not None: + # compute reduced attention_mask correponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices=mask_time_indices + ) + quantized_features = self.project_q(quantized_features) + + loss = contrastive_loss = diversity_loss = None + if sampled_negative_indices is not None: + batch_size, sequence_length, hidden_size = quantized_features.shape + + # for training, we sample negatives + # 3. sample K negatives (distractors) quantized states for contrastive loss + # if attention_mask is passed, make sure that padded feature vectors cannot be sampled + # sample negative quantized vectors BTC => (BxT)C + negative_quantized_features = quantized_features.view(-1, hidden_size)[ + sampled_negative_indices.long().view(-1) + ] + negative_quantized_features = negative_quantized_features.view( + batch_size, sequence_length, -1, hidden_size + ).permute(2, 0, 1, 3) + + # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` + # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf + logits = self.compute_contrastive_logits( + quantized_features[None, :], + negative_quantized_features, + transformer_features, + self.config.contrastive_logits_temperature, + ) + + # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low), + # its cosine similarity will be masked + neg_is_pos = (quantized_features == negative_quantized_features).all(-1) + + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + + # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = + # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) + logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) + target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() + + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") + # 7. compute diversity loss: \mathbf{L}_d + num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups + diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() + + # 8. 
\mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d + loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss + + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return Wav2Vec2ForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + contrastive_loss=contrastive_loss, + diversity_loss=diversity_loss, + ) + + +@add_start_docstrings("""Wav2Vec2 Model with a `language modeling` head on top.""", WAV_2_VEC_2_START_DOCSTRING) +class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + warnings.warn( + "The class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.", FutureWarning + ) + + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout = nn.Dropout(config.final_dropout) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + def forward( + self, + input_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, MaskedLMOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.lm_head(hidden_states) + + if not return_dict: + output = (logits,) + outputs[2:] + return output + + return MaskedLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): + def __init__(self, config, target_lang: Optional[str] = None): + super().__init__(config) + + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout = nn.Dropout(config.final_dropout) + + self.target_lang = target_lang + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def tie_weights(self): + """ + This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when + passing `target_lang=...` to `from_pretrained(...)`. 
+ + This method is **not** supposed to be called by the user and is prone to be changed in the future. + """ + + # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to + # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to + # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is + # ok to repurpose this function here. + target_lang = self.target_lang + + if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None: + raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.") + elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None: + logger.info("By default `target_lang` is set to 'eng'.") + elif target_lang is not None: + self.load_adapter(target_lang, force_load=True) + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_SEQ_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + hidden_states[~padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. 
+ """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.num_labels = config.num_labels + + self.init_weights() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_FRAME_CLASS_CHECKPOINT, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_FRAME_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AMSoftmaxLoss(nn.Module): + def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): + super(AMSoftmaxLoss, self).__init__() + self.scale = scale + self.margin = margin + self.num_labels = num_labels + self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() + + def forward(self, hidden_states, labels): + labels = labels.flatten() + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) + cos_theta = torch.mm(hidden_states, weight) + psi = cos_theta - self.margin + + onehot = nn.functional.one_hot(labels, self.num_labels) + logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) + loss = self.loss(logits, labels) + + return loss + + +class TDNNLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] + self.out_conv_dim = config.tdnn_dim[layer_id] + self.kernel_size = config.tdnn_kernel[layer_id] + self.dilation = config.tdnn_dilation[layer_id] + + self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + hidden_states = hidden_states.unsqueeze(1) + hidden_states = nn.functional.unfold( + hidden_states, + (self.kernel_size, self.in_conv_dim), + stride=(1, self.in_conv_dim), + dilation=(self.dilation, 1), + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.kernel(hidden_states) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
+ """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) + + tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] + self.tdnn = nn.ModuleList(tdnn_layers) + + self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim) + self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim) + + self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels) + + self.init_weights() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the TDNN layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size in self.config.tdnn_kernel: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1) + + return input_lengths + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_XVECTOR_CHECKPOINT, + output_type=XVectorOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_XVECTOR_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, XVectorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file From 9022d8af75c030098477e797d9fce85edd4ea778 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 30 Sep 2023 19:08:07 +0000 Subject: [PATCH 88/89] update FiLM Wav2vec2 --- .../wav2transducer/hf_wav2rnn_film_transducer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 77579c94..b0a0bfea 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -81,12 +81,12 @@ def _make_fuser(self): layer_dim, bias=False) - def _fuse_hid_feats(self, hid_feats, lang): + def _fuse_hid_feats(self, hid_feats, lang_condition): """Fuses the hidden features from the Wav2Vec model. Args: hid_feats: list of hidden features Tensors from Wav2Vec model. - lang: language id Tensor. + lang_condition: language condition Tensor. 
        Returns:
          Tensor of fused features (batch, channels, time)
 
@@ -95,7 +95,6 @@ def _fuse_hid_feats(self, hid_feats, lang):
             # There is only one layer of features
             return hid_feats[0]
 
-        lang_condition = self.transducer.decoder.lang_embedding(lang)
         hid_feats = hid_feats[self.feat_fusion_start:]
         if self.feat_fusion_method == "film-weighted-avg":
             film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films)))
@@ -129,12 +128,17 @@ def forward_feats(self,
                       return_feat_layers=None,
                       chunk_length=0,
                       detach_chunks=False):
+
+
+        lang_condition = self.transducer.decoder.lang_embedding(lang)
+
         return_hid_states = (False if return_feat_layers is None
                              and self.feat_fusion_method == "last" else True)
         with self._hf_context:
             hf_output = self.hf_feats(
                 x,
                 x_lengths,
+                condition_features=lang_condition,
                 return_hid_states=return_hid_states,
                 chunk_length=chunk_length,
                 detach_chunks=detach_chunks,
@@ -142,7 +146,7 @@ def forward_feats(self,
         feat_lengths = hf_output["hidden_states_lengths"]
         if return_hid_states:
             hid_feats = hf_output["hidden_states"]
-            feats = self._fuse_hid_feats(hid_feats, lang)
+            feats = self._fuse_hid_feats(hid_feats, lang_condition)
         else:
             hid_feats = None
             feats = hf_output["last_hidden_state"]

From 27fffa03aaa69c59eb4fd21e653db48777efd609 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sat, 7 Oct 2023 01:21:15 +0000
Subject: [PATCH 89/89] add character-based model for ASR

---
 hyperion/bin/decode_wav2vec2rnn_transducer.py | 14 ++++++--
 hyperion/torch/data/audio_dataset.py          | 21 +++++++++---
 hyperion/torch/data/char_piece.py             | 34 +++++++++++++++++++
 3 files changed, 61 insertions(+), 8 deletions(-)
 create mode 100644 hyperion/torch/data/char_piece.py

diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py
index 33aea8c3..b1af102b 100755
--- a/hyperion/bin/decode_wav2vec2rnn_transducer.py
+++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py
@@ -27,6 +27,7 @@
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.np.augment import SpeechAugment
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.data.char_piece import CharPieceProcessor
 from hyperion.torch.models import HFWav2Vec2RNNTransducer
 from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search
 from hyperion.torch.narchs import AudioFeatsMVN as AF
@@ -133,9 +134,16 @@ def decode_transducer(
     device = init_device(use_gpu)
     model = load_model(model_path, device)
 
-    logging.info("bpe-model=%s", bpe_model)
-    sp = spm.SentencePieceProcessor()
-    sp.load(bpe_model)
+
+
+    if bpe_model.endswith(".txt"):
+        logging.info("loading char piece file %s", bpe_model)
+        sp = CharPieceProcessor()
+        sp.load(open(bpe_model).read().split())
+    else:
+        logging.info("bpe-model=%s", bpe_model)
+        sp = spm.SentencePieceProcessor()
+        sp.load(bpe_model)
 
     infer_args = HFWav2Vec2RNNTransducer.filter_infer_args(**infer_args)
     logging.info(f"infer-args={infer_args}")
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index 45526284..5e604e6a 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -25,7 +25,7 @@
 from ...utils.segment_set import SegmentSet
 from ...utils.text import read_text
 from ..torch_defs import floatstr_torch
-
+from .char_piece import CharPieceProcessor
 
 class AudioDataset(Dataset):
     def __init__(
@@ -115,10 +115,21 @@ def _load_legacy_durations(self, time_durs_file):
         ].class_id.values.astype(float, copy=False)
 
     def _load_bpe_model(self, bpe_model,
                         is_val):
-        if self.rank == 0:
-            logging.info("loading bpe file %s", bpe_model)
-        self.sp = spm.SentencePieceProcessor()
-        self.sp.load(bpe_model)
+        # if bpe_model ends with .txt, it is a char piece model
+        # if bpe_model ends with .model, it is a sentence piece model
+        if bpe_model.endswith(".txt"):
+            if self.rank == 0:
+                logging.info("loading char piece file %s", bpe_model)
+            self.sp = CharPieceProcessor()
+            self.sp.load(open(bpe_model).read().split())
+        else:
+            if self.rank == 0:
+                logging.info("loading bpe file %s", bpe_model)
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.load(bpe_model)
+
+
+
         blank_id = self.sp.piece_to_id("<blk>")
         vocab_size = self.sp.get_piece_size()
diff --git a/hyperion/torch/data/char_piece.py b/hyperion/torch/data/char_piece.py
new file mode 100644
index 00000000..43c07619
--- /dev/null
+++ b/hyperion/torch/data/char_piece.py
@@ -0,0 +1,34 @@
+import logging
+
+class CharPieceProcessor:
+    def __init__(self):
+        self.token2id = {}
+        self.id2token = {}
+
+    def load(self, token_list):
+        for idx, token in enumerate(token_list):
+            self.token2id[token] = idx
+            self.id2token[idx] = token
+        logging.info("Loaded {} tokens".format(len(self.token2id)))
+        logging.info("First 10 tokens: {}".format(list(self.token2id.keys())[:10]))
+        return True
+
+
+    def piece_to_id(self, token):
+        return self.token2id.get(token, self.token2id["<unk>"])
+
+    def id_to_piece(self, idx):
+        return self.id2token.get(idx, "<unk>")
+
+    def encode_as_pieces(self, text):
+        return [char for char in text]
+
+    def encode(self, text, out_type=int):
+        assert out_type in [int]
+        return [self.piece_to_id(char) for char in text]
+
+    def decode(self, ids):
+        return ''.join([self.id_to_piece(idx) for idx in ids])
+
+    def get_piece_size(self):
+        return len(self.token2id)
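
Note (not part of the patch): the snippet below is a minimal usage sketch of the new CharPieceProcessor. It builds a whitespace-separated character vocabulary file and round-trips a short string through encode()/decode(). The char_vocab.txt file name, the <blk>/<unk> special tokens, and the toy transcripts are illustrative assumptions; the recipe produces its own vocabulary file elsewhere.

    # Usage sketch (illustrative only): build a character vocabulary file and
    # round-trip a sentence through the new CharPieceProcessor.
    # The file name, the <blk>/<unk> special tokens, and the toy transcripts are
    # assumptions for this example, not taken from the recipe.
    from hyperion.torch.data.char_piece import CharPieceProcessor

    transcripts = ["hola mundo", "buenos dias"]  # stand-in for CommonVoice transcripts

    # Whitespace characters cannot survive the whitespace-based loader below,
    # so only non-space characters go into the vocabulary.
    chars = sorted({c for text in transcripts for c in text if not c.isspace()})
    tokens = ["<blk>", "<unk>"] + chars  # special tokens first

    with open("char_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(tokens))

    sp = CharPieceProcessor()
    sp.load(open("char_vocab.txt", encoding="utf-8").read().split())

    ids = sp.encode("hola", out_type=int)   # character ids, depend on vocabulary order
    print(ids)
    print(sp.decode(ids))                   # -> "hola"
    print(sp.piece_to_id("<blk>"), sp.get_piece_size())

Because both the decode script and the dataset loader read the vocabulary with open(...).read().split(), a literal space character cannot appear as a token, which is worth keeping in mind if word boundaries are meant to be modeled explicitly.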