Recipes for commonvoice ASR and LID #129

Open
wants to merge 116 commits into base: persephone-asr

Commits (116)
48b1e4e
commonvoice speech recognition recipe
neillu23 Jan 23, 2023
beab75c
update slurm configuration for rockfish
Feb 1, 2023
046b5f7
update data preparation for different languge
Feb 1, 2023
beb2ed5
update config and add cer scripts
Feb 15, 2023
ff0fd55
temporal remove data preparation for duration
Feb 15, 2023
f179db4
Add combination for multiple languages
Feb 15, 2023
f816ed3
Add language identification task for commonvoice
Feb 20, 2023
b524b84
Add Class Weighted Sampler for ASR and utterance-wise LID
Mar 24, 2023
07ddda6
Remove the seg_weighted_mode for sequence-level task
Mar 25, 2023
a2eff8e
Merge remote-tracking branch 'hyp/persephone-asr' into persephone-ref…
Mar 25, 2023
396e020
Update the LID trainer for merging the new dataloader
Mar 27, 2023
2ecdebf
add commonvoice config for rnnt transducer
Mar 27, 2023
d33abe9
Add fine-tuning code for pruned RNN-T, LID, and Both
Apr 22, 2023
3b7e8ac
Add LID decode scripts
neillu23 Apr 22, 2023
85282ac
Merge pull request #127 from hyperion-ml/persephone-asr
jesus-villalba May 3, 2023
35391de
new vox2 dataprep
jesus-villalba May 4, 2023
ebef851
update the np.str to np.str_
May 4, 2023
720bd6e
update np.str to np.str_
neillu23 May 4, 2023
845d2e0
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
May 4, 2023
b112ebd
Add empty __init__.py
May 5, 2023
cf861bc
fix new vox2 dataprep durations, scp -> RecordingSet
jesus-villalba May 8, 2023
c408f74
some fixes in sre21
jesus-villalba May 11, 2023
9c28408
update lid configs and np.str to str
May 13, 2023
7f43376
FiLM transducer
May 13, 2023
20c13e7
Add FiLMed Transducer
May 14, 2023
f8c84a9
remove unused function
May 14, 2023
05474de
Add decode script and configurations
May 15, 2023
cb4f20e
remove redundant code
May 16, 2023
2b61053
update fine-tune and decoding scripts for rnnt_film_transducer
May 17, 2023
ca5327a
update language id trainer to use chunk for training
May 18, 2023
27d579c
sre21 16k recipe finished
jesus-villalba May 20, 2023
df8a24f
add initialize model for joint-training and film training
May 20, 2023
159ff07
Add transducer and languageid joint training
May 21, 2023
29fdfb7
add asr_lid run script
May 21, 2023
3d33522
update film model
May 21, 2023
0c710a4
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
May 21, 2023
f243ded
Merge branch 'persephone-asr' of https://github.com/hyperion-ml/hyper…
neillu23 May 21, 2023
97c77f3
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 May 21, 2023
a5971ab
update transducer_languageid joint model
May 21, 2023
92f33d3
update ASR and LID joint training code
May 22, 2023
190ea29
update configuration
May 22, 2023
16f8b49
update config with mean pruned rnn loss
May 22, 2023
1ebb219
update config to use mean transducer loss
May 22, 2023
c474869
update film parameter name
May 22, 2023
1f7e70b
update more options for film model
May 22, 2023
c6f5dee
add reduction option in fine-tune argument
May 22, 2023
7331d32
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
May 22, 2023
8e82143
update configuration
May 22, 2023
7135a83
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
May 22, 2023
c6ec4e2
fix film bug
May 23, 2023
90ee23c
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
May 23, 2023
a3d14ae
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 May 23, 2023
2787891
sre21 8k adapted to persephone branck
jesus-villalba May 25, 2023
7ea0eb0
update lid training for focal loss and hard negative sampling
Jun 4, 2023
1974cb2
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jun 4, 2023
aed329b
update film transducer decoder for original joiner
Jun 4, 2023
05ccbe5
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jun 4, 2023
5ec0dc7
add different loss for lid
Jun 5, 2023
2e7ddbd
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
Jun 5, 2023
22920dc
add new training mode: film-ft, and add lid-film-asr system
Jun 8, 2023
e12e9f5
update configuration
Jun 8, 2023
7e1fdf8
update model
Jun 8, 2023
4dfe23c
update model for film_transducer_lid
Jun 8, 2023
9e59d74
add activation option for film
Jun 9, 2023
1f56469
add configuration
Jun 9, 2023
92a9c41
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jun 11, 2023
35cb6f3
add config for film-asr-lid model
Jun 11, 2023
0b4bb6b
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jun 11, 2023
250bacf
add new configs for film model
Jun 13, 2023
410100c
update config
Jun 13, 2023
2a8c3c4
update config for film ASR
Jun 18, 2023
1907869
add config for film ASR
Jun 18, 2023
1352974
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jun 20, 2023
87822f6
add bias initialization
Jun 23, 2023
27eea76
add new config
Jun 24, 2023
9d25b5a
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jun 25, 2023
f809855
first config for film-lid model
Jun 28, 2023
8c73fa4
add run script for film-asr-lid
Jun 28, 2023
982499e
update joint-training code
Jun 30, 2023
63a2bd9
added config 2.0 to vox v2
jesus-villalba Jun 30, 2023
5c75b6f
Merge remote-tracking branch 'upstream/persephone-refactor' into pers…
Jun 30, 2023
ac71e9a
update joint-training of LID-FILM-ASR
Jul 1, 2023
3d973ab
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jul 1, 2023
28e61e3
update lid-film-asr training
Jul 2, 2023
b387ddd
update config
neillu23 Jul 2, 2023
d16338e
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jul 2, 2023
acbfc06
update joint training for ASR-LID
Jul 4, 2023
f2e5aad
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
neillu23 Jul 5, 2023
47fae72
merge commit
Jul 5, 2023
562498f
update decode code
neillu23 Jul 5, 2023
27e96ba
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
Jul 5, 2023
458e65e
add rnn_original for film-rnn
Jul 7, 2023
c1d193a
finished experiments of models 2.0 in voxceleb/v2
jesus-villalba Jul 19, 2023
26eca97
add configs for commonvoice speaker verification
Aug 7, 2023
89efce4
voxceleb v1.2 works up to snorm backend
jesus-villalba Sep 4, 2023
77bbad4
Add new parameters for feat_fusion_end
Sep 7, 2023
89c6e20
finished vox v1.2 except plda
jesus-villalba Sep 8, 2023
44f085a
introduce entry points
jesus-villalba Sep 10, 2023
6105476
make it work with cuda 11
jesus-villalba Sep 11, 2023
e4a5be1
Merge pull request #137 from hyperion-ml/persephone-entry
jesus-villalba Sep 11, 2023
392cd30
started vox/v2.1 recipe and fix some readmes
jesus-villalba Sep 11, 2023
ed35173
vox/v2.1 recipe done, not tested
jesus-villalba Sep 11, 2023
8760d05
implemented lora in w2v2, not tested
jesus-villalba Sep 12, 2023
09ba2f2
Merge branch 'persephone-refactor' of https://github.com/hyperion-ml/…
Sep 13, 2023
844b6e3
Merge branch 'persephone-refactor' of https://github.com/neillu23/hyp…
Sep 13, 2023
71f629d
add lora into ASR (haven't tested)
Sep 13, 2023
a75610e
vox2.1 working and lora
jesus-villalba Sep 13, 2023
d4823db
Merge branch 'persephone-refactor' of https://github.com/hyperion-ml/…
Sep 13, 2023
c23103e
lora in wavlm and hubert
jesus-villalba Sep 13, 2023
81c540b
fix bug in w2v constructors with lora
jesus-villalba Sep 15, 2023
95ed74d
Merge branch 'persephone-refactor' of https://github.com/hyperion-ml/…
Sep 15, 2023
a54c963
update default argument of lora_merge_weights to false
Sep 23, 2023
6a72173
update config for 4 langs experiment
Sep 25, 2023
e15b227
Add FiLM inside the Wav2vec2
Sep 30, 2023
9022d8a
update FiLM Wav2vec2
Sep 30, 2023
27fffa0
add charachter based model for ASR
Oct 7, 2023
15 changes: 11 additions & 4 deletions README.md
@@ -26,13 +26,20 @@ The full API is described in the documentation page [https://hyperion-ml.readthe
### Prerequisites

We use anaconda or miniconda, though you should be able to make it work in other python distributions
To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.:
To start, you should create a new enviroment and install PyTorch:
```
conda create --name ${your_env} python=3.8
conda create --name ${your_env} python=3.11
conda activate ${your_env}
conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
```
In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibility with older PyTorch versions.

For systems with cuda 10.2 driver:
```
conda create --name ${your_env} python=3.10
conda activate ${your_env}
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch
```
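A quick way to confirm that either environment above is usable is to query PyTorch directly. This is a generic sanity check added here for illustration, not part of the README itself; it only assumes it is run inside the conda environment just created:
```
# Generic post-install check (assumption: run inside the conda env created above).
import torch

print(torch.__version__)            # typically 2.x for the pytorch-cuda=11.8 install, 1.12.1 for cudatoolkit=10.2
print(torch.cuda.is_available())    # True only if the driver matches the installed CUDA build
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```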


### Installing Hyperion

33 changes: 33 additions & 0 deletions egs/commonvoice/v1/cmd.sh
@@ -0,0 +1,33 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances 'queue.pl' to run.pl (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

if [ "$(hostname -d)" == "cm.gemini" ];then
export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G"
#export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G"
export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G"
export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G"
#export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G"
#export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
elif [ "$(hostname -d)" == "cm.cluster" ];then
export train_cmd="slurm.pl --config conf/slurm.conf --mem 4G"
export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G"
export cuda_eval_cmd="$train_cmd"
else
export train_cmd="run.pl"
export cuda_cmd="run.pl"
export cuda_eval_cmd="$train_cmd"
#export train_cmd="queue.pl --config conf/clsp.conf --mem 4G "
#export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G"
#export cuda_eval_cmd="$train_cmd"
fi

16 changes: 16 additions & 0 deletions egs/commonvoice/v1/conf/clsp.conf
@@ -0,0 +1,16 @@

# Default configuration
command sbatch --export=PATH
#command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
option mem=* --mem-per-cpu $0
# option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -p GPU-shared --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU
#option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*'
#option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0'
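In these Kaldi-style queue configuration files, an `option name=* <flags>` line maps a recipe option such as `--gpu 2` to scheduler flags by substituting the option's value for `$0`. A toy illustration of that substitution using the `gpu` template from this clsp.conf (this is not `utils/queue.pl` or `slurm.pl` themselves, just the expansion rule):
```
# Toy illustration: queue.pl/slurm.pl replace $0 in an "option gpu=* ..." template
# with the value passed as --gpu N.
template = "-p GPU-shared --gres=gpu:$0 -c $0"
print(template.replace("$0", "2"))  # -p GPU-shared --gres=gpu:2 -c 2
```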

11 changes: 11 additions & 0 deletions egs/commonvoice/v1/conf/coe_gpu_bigmem.conf
@@ -0,0 +1,11 @@

# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
option mem=* -l mem_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l num_proc=$0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]*
option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]*
13 changes: 13 additions & 0 deletions egs/commonvoice/v1/conf/coe_gpu_long.conf
@@ -0,0 +1,13 @@

# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
option mem=* -l mem_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l num_proc=$0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]*
option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]*


11 changes: 11 additions & 0 deletions egs/commonvoice/v1/conf/coe_gpu_rtx.conf
@@ -0,0 +1,11 @@

# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
option mem=* -l mem_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l num_proc=$0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q -l h_rt=100:00:00
option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx
11 changes: 11 additions & 0 deletions egs/commonvoice/v1/conf/coe_gpu_short.conf
@@ -0,0 +1,11 @@

# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
option mem=* -l mem_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l num_proc=$0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]*
option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]*
11 changes: 11 additions & 0 deletions egs/commonvoice/v1/conf/coe_gpu_v100.conf
@@ -0,0 +1,11 @@

# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
option mem=* -l mem_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l num_proc=$0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -q all.q -l h_rt=100:00:00
option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100
2 changes: 2 additions & 0 deletions egs/commonvoice/v1/conf/infer.yaml
@@ -0,0 +1,2 @@
beam_width: 5
decoding_method: time_sync_beam_search
35 changes: 35 additions & 0 deletions egs/commonvoice/v1/conf/reverb_noise_aug.yaml
@@ -0,0 +1,35 @@
reverb_aug:
reverb_prob: 0.45
max_reverb_context: 0.5
rir_types:
smallroom:
weight: 1
rir_path: scp:data/rirs_smallroom/rirs.scp
rir_norm: max
mediumroom:
weight: 1
rir_path: scp:data/rirs_mediumroom/rirs.scp
rir_norm: max
realroom:
weight: 1
rir_path: scp:data/rirs_real/rirs.scp
rir_norm: max
noise_aug:
noise_prob: 0.7
noise_types:
noise:
weight: 1
noise_path: data/musan_noise_proc_audio/wav.scp
min_snr: 0
max_snr: 18
music:
weight: 1
noise_path: data/musan_music_proc_audio/wav.scp
min_snr: 3
max_snr: 18
babble:
weight: 1
noise_path: data/musan_speech_babble/wav.scp
min_snr: 3
max_snr: 18
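This config drives on-the-fly augmentation: with probability `reverb_prob` a RIR type is drawn by weight and convolved with the signal, and with probability `noise_prob` a noise type is drawn by weight and mixed at an SNR taken from its `min_snr`/`max_snr` range. A minimal sketch of that sampling logic follows; the function is illustrative only and is not Hyperion's augmentation class, which does the actual signal processing:
```
# Illustrative sketch of how reverb_noise_aug.yaml parameters could be sampled;
# Hyperion's own augmentation classes perform the real convolution and mixing.
import random
import yaml

def sample_aug_plan(cfg):
    plan = {}
    reverb = cfg["reverb_aug"]
    if random.random() < reverb["reverb_prob"]:
        names = list(reverb["rir_types"])
        weights = [reverb["rir_types"][n]["weight"] for n in names]
        plan["rir_type"] = random.choices(names, weights=weights)[0]
    noise = cfg["noise_aug"]
    if random.random() < noise["noise_prob"]:
        names = list(noise["noise_types"])
        weights = [noise["noise_types"][n]["weight"] for n in names]
        name = random.choices(names, weights=weights)[0]
        snr_db = random.uniform(noise["noise_types"][name]["min_snr"],
                                noise["noise_types"][name]["max_snr"])
        plan["noise"] = (name, round(snr_db, 1))
    return plan

with open("conf/reverb_noise_aug.yaml") as f:
    cfg = yaml.safe_load(f)
print(sample_aug_plan(cfg))  # e.g. {'rir_type': 'smallroom', 'noise': ('music', 7.4)}
```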

15 changes: 15 additions & 0 deletions egs/commonvoice/v1/conf/slurm.conf
@@ -0,0 +1,15 @@
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
default time=24:00:00
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0
option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
@@ -0,0 +1,91 @@
data:
train:
dataset:
wav_scale: 1
aug_cfgs:
- conf/reverb_noise_aug.yaml
return_segment_info:
- text
- language
sampler:
#sampler_type: 'bucketing_seg_sampler'
sampler_type: 'class_weighted_random_bucketing_seg_sampler'
max_batch_length: 40
max_audio_length: 15.
min_batch_size: 1
drop_last: false
# for class_weighted_random_bucketing_seg_sampler
base_sampler_type: class_weighted_seg_sampler
weight_mode: "data-prior"
class_name: "language"
weight_exponent: 0.3
num_chunks_per_seg_epoch: 0.1

data_loader:
num_workers: 1
val:
dataset:
aug_cfgs:
- conf/reverb_noise_aug.yaml
wav_scale: 1
return_segment_info:
- text
- language
sampler:
#sampler_type: 'bucketing_seg_sampler'
sampler_type: 'class_weighted_random_bucketing_seg_sampler'
max_batch_length: 40
max_audio_length: 15.
min_batch_size: 1
drop_last: true
# for class_weighted_random_bucketing_seg_sampler
base_sampler_type: class_weighted_seg_sampler
weight_mode: "data-prior"
class_name: "language"
weight_exponent: 0.3
num_chunks_per_seg_epoch: 0.5
data_loader:
num_workers: 1
model:
hf_feats:
pretrained_model_path: facebook/wav2vec2-xls-r-300m
transducer:
decoder:
prune_range: 15
rnnt_loss: k2_pruned
simple_loss_scale: 0.2
predictor:
embed_dim: 1024
num_layers: 2
hid_feats: 512
embed_dropout_rate: 0.4
rnn_dropout_rate: 0.4
rnn_type: lstm
joiner:
joiner_type: film_joiner
hid_feats: 512
feat_fusion_method: film-weighted-avg
feat_fusion_start: 2
trainer:
optim:
opt_type: sgd
lr: 0.002
momentum: 0.9
weight_decay: 4e-4
lrsched:
lrsch_type: exp_lr
decay_rate: 0.5
decay_steps: 45000
hold_steps: 30000
min_lr: 4e-5
warmup_steps: 6000
update_lr_on_opt_step: true
grad_clip: 100
use_amp: true
log_interval: 1000
epochs: 120
# eff_batch_size: 1024
eff_batch_size: 128
train_mode: hf-feats-frozen-nograd
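This config conditions the transducer on a language embedding through FiLM (`joiner_type: film_joiner`, `feat_fusion_method: film-weighted-avg`): a conditioning vector predicts a per-channel scale and shift that modulates the acoustic features. The block below is a generic FiLM layer for illustration, not Hyperion's module; the class and variable names are made up, 512 matches `hid_feats` above, and the 256-dimensional condition is only an assumed size:
```
# Generic FiLM (feature-wise linear modulation) layer; a sketch of the idea
# behind film_joiner / film-weighted-avg, not Hyperion's actual implementation.
import torch
import torch.nn as nn

class FiLM(nn.Module):
    def __init__(self, feat_dim: int, cond_dim: int):
        super().__init__()
        self.to_gamma = nn.Linear(cond_dim, feat_dim)  # per-channel scale
        self.to_beta = nn.Linear(cond_dim, feat_dim)   # per-channel shift

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, feat_dim) acoustic features
        # cond: (batch, cond_dim) conditioning vector, e.g. a language embedding
        gamma = self.to_gamma(cond).unsqueeze(1)
        beta = self.to_beta(cond).unsqueeze(1)
        return gamma * x + beta

# Toy usage with hid_feats=512 as above and an assumed 256-dim language embedding.
film = FiLM(feat_dim=512, cond_dim=256)
feats = torch.randn(2, 100, 512)
lang_emb = torch.randn(2, 256)
print(film(feats, lang_emb).shape)  # torch.Size([2, 100, 512])
```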


@@ -0,0 +1,91 @@
data:
train:
dataset:
wav_scale: 1
aug_cfgs:
- conf/reverb_noise_aug.yaml
return_segment_info:
- text
- language
sampler:
#sampler_type: 'bucketing_seg_sampler'
sampler_type: 'class_weighted_random_bucketing_seg_sampler'
max_batch_length: 40
max_audio_length: 15.
min_batch_size: 1
drop_last: false
# for class_weighted_random_bucketing_seg_sampler
base_sampler_type: class_weighted_seg_sampler
weight_mode: "data-prior"
class_name: "language"
weight_exponent: 0.3
num_chunks_per_seg_epoch: 0.1

data_loader:
num_workers: 1
val:
dataset:
aug_cfgs:
- conf/reverb_noise_aug.yaml
wav_scale: 1
return_segment_info:
- text
- language
sampler:
#sampler_type: 'bucketing_seg_sampler'
sampler_type: 'class_weighted_random_bucketing_seg_sampler'
max_batch_length: 40
max_audio_length: 15.
min_batch_size: 1
drop_last: true
# for class_weighted_random_bucketing_seg_sampler
base_sampler_type: class_weighted_seg_sampler
weight_mode: "data-prior"
class_name: "language"
weight_exponent: 0.3
num_chunks_per_seg_epoch: 1.0
data_loader:
num_workers: 1
model:
hf_feats:
pretrained_model_path: facebook/wav2vec2-xls-r-300m
transducer:
decoder:
prune_range: 15
rnnt_loss: k2_pruned
simple_loss_scale: 0.2
condition_size: 256
predictor:
embed_dim: 1024
num_layers: 2
hid_feats: 512
embed_dropout_rate: 0.4
rnn_dropout_rate: 0.4
rnn_type: lstm_residual
joiner:
hid_feats: 512
feat_fusion_method: film-weighted-avg
feat_fusion_start: 2
trainer:
optim:
opt_type: sgd
lr: 0.002
momentum: 0.9
weight_decay: 4e-4
lrsched:
lrsch_type: exp_lr
decay_rate: 0.5
decay_steps: 45000
hold_steps: 30000
min_lr: 4e-5
warmup_steps: 6000
update_lr_on_opt_step: true
grad_clip: 100
use_amp: true
log_interval: 1000
epochs: 120
# eff_batch_size: 1024
eff_batch_size: 128
train_mode: hf-feats-frozen-nograd
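Both training configs select `class_weighted_random_bucketing_seg_sampler` with `weight_mode: data-prior`, `class_name: language`, and `weight_exponent: 0.3`, which flattens the per-language data distribution so that low-resource languages are sampled more often than their raw priors would suggest. One plausible reading of that reweighting is sketched below; the exact formula inside Hyperion's sampler may differ, and the per-language data amounts are invented for illustration:
```
# Hedged sketch: flatten per-language priors with weight_exponent, as one possible
# interpretation of weight_mode: data-prior. This is not Hyperion's sampler code.
def flattened_language_weights(seconds_per_lang, weight_exponent=0.3):
    total = sum(seconds_per_lang.values())
    priors = {l: s / total for l, s in seconds_per_lang.items()}   # empirical data prior
    unnorm = {l: p ** weight_exponent for l, p in priors.items()}  # 1.0 = natural, 0.0 = uniform
    z = sum(unnorm.values())
    return {l: round(w / z, 3) for l, w in unnorm.items()}

# Invented data amounts for three Common Voice languages.
print(flattened_language_weights({"it": 300.0, "cy": 30.0, "eu": 10.0}))
# roughly {'it': 0.537, 'cy': 0.269, 'eu': 0.194} instead of priors 0.88 / 0.09 / 0.03
```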

